Put under version control

2021-11-29 09:16:31 +00:00 · 2021-11-29 09:16:31 +00:00 · a6af5b12d2
commit a6af5b12d2
parent d26d23348b
83 changed files with 20130 additions and 0 deletions
--- a/doc/source/config_template/plugins/__init__.py
+++ b/doc/source/config_template/plugins/__init__.py
--- a/doc/source/config_template/plugins/filter_resource_path.py
+++ b/doc/source/config_template/plugins/filter_resource_path.py
@@ -0,0 +1,22 @@
+"""
+Filter paths found in a resource.
+
+This plugin implements :func:`rp_filter`.
+"""
+
+from typing import Optional
+
+
+def rp_filter(site, durl) -> Optional[str]:
+    """
+    Adjust or filter a path found in a resource (may depend on site).
+
+    Return the path to keep, or None to filter it out (i.e., not add it
+    to table `site_path`). A trailing '?amp=1' is removed from kept paths.
+    """
+    found = durl.pwa()
+    # Do not fetch images linked from a tags (img tags are skipped anyway);
+    # the extension is compared case-insensitively.
+    if found.lower().endswith(('.jpg', '.png')):
+        return None
+    return found.removesuffix('?amp=1')
--- a/doc/source/config_template/plugins/filter_site.py
+++ b/doc/source/config_template/plugins/filter_site.py
@@ -0,0 +1,47 @@
+"""
+Relevance estimation of sites.
+
+This plugin implements :func:`site_filter`.
+"""
+
+import re
+
+from atextcrawler.models import Site
+
+MIN_RELEVANCE_SCORE = 5
+
+
+async def site_filter(site: Site) -> bool:
+    """
+    Assess relevance of the site (using language-dependent criteria).
+
+    An applicable criterion scores its weight per hit in the startpage text
+    and 4x its weight each for a matching title and description.
+    Return True if the site shall be crawled (score >= MIN_RELEVANCE_SCORE).
+    """
+    site_langs = set(site.langs)
+    # limit to sites in English or German language
+    if not {'de', 'en'} & site_langs:
+        return False
+    score = 0.0
+    for _crit_name, weight, langs, crit_re in re_criteria:
+        if '*' not in langs and not site_langs & set(langs):
+            continue  # criterion does not apply to any language of the site
+        # tolerate a missing startpage text, as is done for title/description
+        score += weight * len(crit_re.findall(site.startpage_text or ''))
+        for text in (site.title, site.description):
+            if text and crit_re.search(text):
+                score += 4 * weight
+    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)
+    return score >= MIN_RELEVANCE_SCORE
+
+
+# Relevance criteria: (name, weight, languages, compiled regex) entries;
+# the language '*' makes a criterion apply to sites in any language.
+re_criteria = {
+    # words starting with 'anarch'; the lookbehind excludes e.g. 'panarchy'
+    ('anarch', 1.0, ('*',),
+     re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I)),
+    # languages must be a tuple: ('de') would be just the string 'de'
+    ('libertär', 0.5, ('de',), re.compile('(libert(är|&auml;r))', re.I)),
+}
--- a/doc/source/config_template/plugins/filter_site_path.py
+++ b/doc/source/config_template/plugins/filter_site_path.py
@@ -0,0 +1,24 @@
+"""
+Plugin for filtering paths of a site to be retrieved.
+
+This plugin implements :func:`sp_filter`.
+"""
+
+
+def sp_filter(site, path, robots) -> bool:
+    """
+    Per-site path filter. Return whether the path shall be retrieved.
+
+    Return False if `robots` disallows fetching the URL. For sites whose
+    'generator' meta info contains 'amusewiki' (case-insensitive), also
+    return False for paths ending in .html, .epub, .tex, .zip or .pdf
+    and for paths containing '/bbselect?'. Return True otherwise.
+    """
+    if not robots.can_fetch_url(site.base_url + path):
+        return False
+    generator = site.meta_info.get('generator', '')
+    if 'amusewiki' not in generator.lower():
+        return True
+    skipped_suffixes = ('.html', '.epub', '.tex', '.zip', '.pdf')
+    is_skipped = path.endswith(skipped_suffixes) or '/bbselect?' in path
+    return not is_skipped