Fix scripts
8
avif.py
@@ -1,8 +0,0 @@
|
||||
import os
import glob
import subprocess

# Convert every JPEG under the current directory tree to AVIF with avifenc.
# NOTE(security): file names come straight from the filesystem; the old code
# interpolated them into a shell string for os.system, which breaks on quotes
# and allows command injection. subprocess.run with an argument list avoids
# the shell entirely.
for jpg_path in glob.glob(os.path.join(os.getcwd(), "**", "*.jpg"), recursive=True):
    avif_path = jpg_path[: len(jpg_path) - 4] + ".avif"  # swap ".jpg" for ".avif"
    command = ["avifenc", jpg_path, "-o", avif_path]
    print(" ".join(command))
    subprocess.run(command)
|
||||
290
hydra.py
Normal file
@@ -0,0 +1,290 @@
|
||||
import argparse
|
||||
import gzip
|
||||
import json
|
||||
import sys
|
||||
from concurrent import futures
|
||||
from html.parser import HTMLParser
|
||||
from http.client import IncompleteRead, InvalidURL
|
||||
from os import path
|
||||
from queue import Empty, Queue
|
||||
from socket import timeout as SocketTimeoutError
|
||||
from urllib import error, parse, request
|
||||
|
||||
|
||||
class Config:
    """Crawler settings, loaded from JSON with built-in defaults.

    If *config_filename* is a non-empty path to a JSON file, any key present
    there overrides the corresponding default attribute below.
    """

    def __init__(self, config_filename=""):
        # Use these default settings if no configuration file is provided
        self.tags = ["a", "link", "img", "script"]  # HTML tags scanned for links
        self.attrs = ["href", "src"]  # attributes that may hold a link
        self.exclude_scheme_prefixes = ["tel:", "javascript:"]  # schemes to skip
        self.threads = 50  # worker-pool size
        self.timeout = 60  # per-request timeout in seconds
        self.OK = [200, 999]  # status codes that are never reported as broken
        self.graceful_exit = False  # when True, broken links don't fail the run

        if config_filename != "":
            # Update settings if there is a config file
            with open(config_filename, "r") as file:
                config_json = json.loads(file.read())
            self.tags = config_json.get("tags", self.tags)
            self.attrs = config_json.get("attrs", self.attrs)
            self.exclude_scheme_prefixes = config_json.get(
                "exclude_scheme_prefixes", self.exclude_scheme_prefixes
            )
            self.threads = config_json.get("threads", self.threads)
            self.timeout = config_json.get("timeout", self.timeout)
            self.OK = config_json.get("OK", self.OK)
            self.graceful_exit = config_json.get("graceful_exit", self.graceful_exit)

    def __str__(self):
        """Return a readable multi-line summary of the settings.

        BUG FIX: the adjacent f-strings used to concatenate with no
        separators at all, producing one unreadable run-on line.
        """
        return "\n".join(
            (
                f"tags: {self.tags}",
                f"attrs: {self.attrs}",
                f"exclude_scheme_prefixes = {self.exclude_scheme_prefixes}",
                f"threads = {self.threads}",
                f"timeout = {self.timeout}",
                f"OK = {self.OK}",
            )
        )
|
||||
|
||||
|
||||
class Parser(HTMLParser):
    """Parse tags found in webpages to get more links to check."""

    def __init__(self, config):
        super(Parser, self).__init__()
        self.links = []  # attribute values harvested by the last feed
        self.config = config  # supplies tags / attrs / exclude_scheme_prefixes

    def handle_starttag(self, tag, attrs):
        """Collect link values from configured tags and attributes.

        ``attrs`` is a list of ``(name, value)`` pairs; empty values are
        skipped by the truthiness test on ``value``.
        """
        # Ignore tags we aren't configured to check
        if tag not in self.config.tags:
            return
        for name, value in attrs:
            if name not in self.config.attrs or not value:
                continue
            # BUG FIX: an excluded scheme (e.g. "tel:") used to `return`,
            # silently dropping every remaining attribute of the same tag.
            # Skip only the excluded value and keep scanning the rest.
            if any(
                value.startswith(prefix)
                for prefix in self.config.exclude_scheme_prefixes
            ):
                continue
            self.links.append(value)

    def feed_me(self, data):
        """Parse *data* and return the list of links found in it."""
        self.links = []
        self.feed(data)
        return self.links

    def error(self, msg):
        # Older html.parser versions call error() on malformed markup;
        # swallow it so one bad page doesn't abort the crawl.
        return msg
|
||||
|
||||
|
||||
def extract_domain(link):
    """Return the network location of *link* (e.g. "example.com").

    Used to make sure the crawler stays on the website it started from.
    """
    return parse.urlsplit(link).netloc
|
||||
|
||||
|
||||
class Checker:
    """Crawl a site breadth-first via a thread pool and record broken links."""

    # Work queue shared between the crawl loop and the completion callbacks.
    TO_PROCESS = Queue()

    def __init__(self, url, config):
        self.config = config
        self.broken = []  # report entries for failed links
        self.domain = extract_domain(url)  # only pages on this domain are parsed
        self.visited = set()  # URLs already submitted for checking
        self.mailto_links = list()  # collected "mailto:" addresses
        self.pool = futures.ThreadPoolExecutor(max_workers=self.config.threads)
        self.report = ""

    def add_entry(self, code, reason, page):
        """Add a link to the report, unless *code* is configured as OK."""
        if code in self.config.OK:
            return
        # (The old no-op `code = code` / `reason = reason` lines are gone.)
        self.broken.append(
            {
                "code": code,
                "link": page["url"],
                "parent": page["parent"],
                "err": reason,
            }
        )

    def load_url(self, page, timeout):
        """Try to retrieve contents of a page and record the result.

        Returns a dict with the checked URL, its parent, the decoded body
        and a content-type flag — or ``None`` when the fetch failed (the
        failure is recorded via :meth:`add_entry`).
        """
        result = {
            "url": page["url"],
            "parent": page["parent"],
            "data": "",
            "valid_content_type": False,
        }

        # Use GET as HEAD is frequently not allowed
        r = request.Request(
            page["url"],
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0"
            },
        )

        try:
            http_response = request.urlopen(r, timeout=self.config.timeout)

            encoding = http_response.headers.get("Content-Encoding")
            if encoding and "gzip" in encoding:
                data = gzip.decompress(http_response.read()).decode(
                    encoding="utf-8", errors="ignore"
                )
            elif encoding is None:
                data = http_response.read().decode(encoding="utf-8", errors="ignore")
            else:
                # Support for other less common directives not handled
                raise NotImplementedError
            result["data"] = data

            content_type = http_response.headers.get("Content-Type")
            # BUG FIX: the old `A and B or C` expression evaluated
            # `"text/plain" in None` (a TypeError) when the header was
            # missing, so such pages were wrongly reported as broken.
            result["valid_content_type"] = content_type is not None and (
                "text/html" in content_type or "text/plain" in content_type
            )

        except error.HTTPError as e:
            self.add_entry(e.getcode(), e.reason, page)
            return
        except (
            error.URLError,
            ConnectionRefusedError,
            ConnectionResetError,
            IncompleteRead,
            InvalidURL,
            NotImplementedError,
            SocketTimeoutError,
            TimeoutError,
            TypeError,
            UnicodeError,
        ) as e:
            # NOTE: a separate `except TimeoutError` branch (code 408) used
            # to follow this one; it was unreachable because TimeoutError is
            # already caught here, so it has been removed.
            self.add_entry(0, e, page)
            return

        return result

    def handle_future(self, result):
        """Completion callback: parse the fetched page if the fetch succeeded."""
        page = result.result()  # call once; None means the fetch failed
        if page:
            self.parse_page(page)

    def parse_page(self, page):
        """Get more links from successfully retrieved pages in the same domain."""
        if self.domain == extract_domain(page["url"]) and page["valid_content_type"]:
            parent = page["url"]
            parser = Parser(self.config)
            for found in parser.feed_me(page["data"]):
                full_link = parse.urljoin(parent, found)
                if full_link not in self.visited:
                    self.TO_PROCESS.put({"parent": parent, "url": full_link})

    def make_report(self):
        """Parse broken links list into YAML report"""
        self.report = "---\ntitle: Broken Link Report"
        self.report += "\nchecked: " + str(len(self.visited))
        self.report += "\nnumber of email links: " + str(len(self.mailto_links))
        self.report += "\nemails: " + ", ".join(
            [str(m) for m in set(self.mailto_links)]
        )
        self.report += "\nbroken: " + str(len(self.broken))
        self.report += "\n---\n"
        # Highest status codes first so hard failures lead the report.
        sorted_list = sorted(self.broken, key=lambda k: k["code"], reverse=True)
        for link in sorted_list:
            self.report += f"\n- code: {link['code']}\n  url: {link['link']}\n  parent: {link['parent']}\n  error: {link['err']}\n"
        return self.report

    def run(self):
        """Run crawler until TO_PROCESS queue is empty"""
        while True:
            try:
                target_url = self.TO_PROCESS.get(block=True, timeout=4)
                if target_url["url"].startswith("mailto:"):
                    email = target_url["url"][len("mailto:") :]
                    self.mailto_links.append(email)

                elif target_url["url"] not in self.visited:
                    self.visited.add(target_url["url"])
                    job = self.pool.submit(
                        self.load_url, target_url, self.config.timeout
                    )
                    job.add_done_callback(self.handle_future)
            except Empty:
                # Nothing arrived for 4 s: the crawl is finished.
                return
            except Exception as e:
                # Best-effort crawl: log the problem and keep consuming.
                print(e)
|
||||
|
||||
|
||||
def main():
    """Validate arguments and run Hydra"""
    arg_parser = argparse.ArgumentParser(
        description="Crawl a website and check for broken links.",
        epilog="A broken links report will be output to stdout, so you may like to redirect this to a file.",
    )
    arg_parser.add_argument(
        "URL", help="The URL of the website to crawl, e.g. https://example.com"
    )
    arg_parser.add_argument("--config", "-c", help="Path to a configuration file")
    args = arg_parser.parse_args()

    # A configuration file, when given, must actually exist.
    if args.config and not path.exists(args.config):
        print(f"Can't find {args.config} as config file.")
        sys.exit(1)

    # The URL must carry both a scheme and a host.
    url = args.URL
    url_parts = parse.urlparse(url)
    if not url_parts.scheme or not url_parts.netloc:
        print("Please provide a valid URL with scheme, e.g. https://example.com")
        sys.exit(1)

    # Configure and run Hydra; an empty path selects the built-in defaults.
    config = Config(args.config if args.config else "")
    check = Checker(url, config)
    check.TO_PROCESS.put({"parent": url, "url": url})
    check.run()
    print(check.make_report())

    # Broken links fail the run unless graceful_exit is configured.
    if check.broken and not check.config.graceful_exit:
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: crawl the given URL and print a broken-link report.
if __name__ == "__main__":
    main()
|
||||
4
makefile
@@ -22,6 +22,10 @@ developall: ## Run the site localy with all the article, future or drafts
|
||||
gomodule: ## Update Go Module
|
||||
@hugo mod get -u
|
||||
|
||||
.PHONY: hydra
|
||||
hydra: ## Check links
|
||||
@poetry run python hydra.py http://localhost:1313/ --config ./hydra-config.json
|
||||
|
||||
.PHONY: syntax
|
||||
syntax: ## Build the style of the code
|
||||
@hugo gen chromastyles --style=dracula > themes/fugu/assets/css/_syntax.scss
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
from pprint import pprint
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
|
||||
|
||||
34
poetry.lock
generated
@@ -1,5 +1,26 @@
|
||||
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.12.3"
|
||||
description = "Screen-scraping library"
|
||||
optional = false
|
||||
python-versions = ">=3.6.0"
|
||||
files = [
|
||||
{file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
|
||||
{file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
soupsieve = ">1.2"
|
||||
|
||||
[package.extras]
|
||||
cchardet = ["cchardet"]
|
||||
chardet = ["chardet"]
|
||||
charset-normalizer = ["charset-normalizer"]
|
||||
html5lib = ["html5lib"]
|
||||
lxml = ["lxml"]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2024.7.4"
|
||||
@@ -233,6 +254,17 @@ files = [
|
||||
{file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "soupsieve"
|
||||
version = "2.5"
|
||||
description = "A modern CSS selector implementation for Beautiful Soup."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
|
||||
{file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "2.2.2"
|
||||
@@ -253,4 +285,4 @@ zstd = ["zstandard (>=0.18.0)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.12"
|
||||
content-hash = "98d8cd3688823d624ec5b07b8cebaf83c35241dbdf069c603d76d75e393584d4"
|
||||
content-hash = "1afd2299b83d3a54d0d606ee5859efdb33f5e97ab007b8acc1cbb6a2df6ba3bb"
|
||||
|
||||
@@ -11,6 +11,7 @@ python = "^3.12"
|
||||
requests = "^2.32.3"
|
||||
feedparser = "^6.0.11"
|
||||
rich = "^13.7.1"
|
||||
beautifulsoup4 = "^4.12.3"
|
||||
|
||||
|
||||
[build-system]
|
||||
|
||||
|
After Width: | Height: | Size: 38 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
After Width: | Height: | Size: 70 KiB |
|
After Width: | Height: | Size: 149 KiB |
|
After Width: | Height: | Size: 113 KiB |
|
After Width: | Height: | Size: 36 KiB |
|
After Width: | Height: | Size: 72 KiB |
|
After Width: | Height: | Size: 181 KiB |
@@ -1,5 +1,3 @@
|
||||
import mailbox
|
||||
from pip import main
|
||||
import requests
|
||||
import feedparser
|
||||
from rich.console import Console
|
||||
@@ -9,21 +7,19 @@ feed_url = "http://fundor333.com/index.xml"
|
||||
|
||||
|
||||
def send_webmention(url: str):
    """Ask webmention.app to dispatch webmentions for *url* and print the result.

    Prints the HTTP status code (green for 2xx/3xx, red otherwise) followed
    by the URL that was submitted.
    """
    # NOTE(security/review): the API token is hard-coded in the URL below;
    # move it to an environment variable before sharing this script.
    # send post request to webmention
    r = requests.post(
        f"https://webmention.app/check?token=d6ecd337-f1c5-4b3f-8e82-5dc280d727fa&url={url}"
    )
    console = Console()
    # Any 2xx/3xx answer counts as success.
    color = "green" if 200 <= r.status_code < 400 else "red"
    text = Text.assemble((str(r.status_code), color), f" {url}")
    console.print(text)
|
||||
|
||||
|
||||
|
||||
def get_url_from_feed():
    """Send a webmention for every entry in the site feed."""
    entries = feedparser.parse(feed_url).entries
    for entry in entries:
        send_webmention(entry.link)
|
||||
|
||||
26
webmention.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import requests
|
||||
import feedparser
|
||||
from rich.console import Console
|
||||
from rich.text import Text
|
||||
|
||||
feed_url = "http://fundor333.com/index.xml"
|
||||
|
||||
|
||||
def get_webmention(url_post: str):
    """Fetch and print the webmentions recorded for *url_post* on webmention.io."""
    url = "https://webmention.io/api/mentions.jf2"

    # BUG FIX: the target filter was previously sent as a request *body*
    # (data=) on a GET, which the server ignores; it must be a query-string
    # parameter for the API to filter by target.
    payload = {"target[]": url_post}

    response = requests.get(url, params=payload)

    print(url_post)
    print(response.json()["children"])
|
||||
|
||||
|
||||
def get_url_from_feed():
    """Look up recorded webmentions for every entry in the site feed."""
    entries = feedparser.parse(feed_url).entries
    for entry in entries:
        get_webmention(entry.link)
|
||||
|
||||
|
||||
# Run the feed scan when the script is executed.
get_url_from_feed()
|
||||