Put under version control

ibu 2021-11-29 09:16:31 +00:00
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.
# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/
# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/
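The format above is simple enough to parse line by line. A minimal sketch of such a parser (the helper `parse_seed_urls` and its return shape are illustrative, not part of atextcrawler's API):

```python
def parse_seed_urls(text: str) -> tuple[list[str], list[str]]:
    """Split a seed file into (whitelisted, blacklisted) URL lists."""
    whitelist: list[str] = []
    blacklist: list[str] = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue  # blank line or comment
        if line.startswith('+'):
            whitelist.append(line[1:])
        elif line.startswith('-'):
            blacklist.append(line[1:])
    return whitelist, blacklist
```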

@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler
# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod
# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info
# Plugins directory
# If given as a relative path, it will be relative to the
# directory of this file (main.yaml).
# Read the documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins
# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________
# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3
  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item.
  # Also the delay in seconds after a worker has found
  # no site to process.
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10
  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600
  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3
  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000
  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400
# Parameters for access to the Elasticsearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext
# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict
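The `model_server_endpoint` is a TensorFlow Serving REST URL; that API accepts a JSON body with an `instances` list and returns a `predictions` list. A minimal async sketch, assuming the sentence model takes raw strings (the exact payload depends on the deployed model's signature, and `embed_sentences` is an illustrative name):

```python
import aiohttp


async def embed_sentences(sentences: list[str], endpoint: str) -> list:
    """POST sentences to a TensorFlow Serving predict endpoint."""
    async with aiohttp.ClientSession() as session:
        async with session.post(endpoint, json={'instances': sentences}) as resp:
            resp.raise_for_status()
            data = await resp.json()
    return data['predictions']
```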

@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.
This plugin implements :func:`rp_filter`.
"""
from typing import Optional
def rp_filter(site, durl) -> Optional[str]:
"""
Adjust or filter found paths (may depend on site).
To filter out a path (i.e., not add it to table `site_path`)
return None.
"""
path = durl.pwa()
# skip fetching images (linked from a tags; img tags are skipped anyway)
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
return None
path = path.removesuffix('?amp=1')
return path
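To see the filter in action, the `durl` argument can be stubbed; only its `pwa()` method is used above (the stub is hypothetical, and `str.removesuffix` requires Python 3.9+):

```python
from types import SimpleNamespace

durl = SimpleNamespace(pwa=lambda: '/2021/11/some-post/?amp=1')
print(rp_filter(None, durl))  # -> '/2021/11/some-post/'

durl = SimpleNamespace(pwa=lambda: '/media/photo.JPG')
print(rp_filter(None, durl))  # -> None (image links are dropped)
```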

@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.
This plugin implements :func:`site_filter`.
"""
import re
from atextcrawler.models import Site
MIN_RELEVANCE_SCORE = 5
async def site_filter(site: Site) -> bool:
"""
Assess relevance of the site (using language-dependent criteria).
If the site shall be crawled, return True, else False.
"""
# limit to sites in English or German language
if not set(['de', 'en']) & set(site.langs):
return False
score = 0.0
for crit_name, weight, langs, crit_re in re_criteria:
if '*' in langs or set(langs) & set(site.langs):
findings = crit_re.findall(site.startpage_text)
if findings:
score += weight * len(findings)
if site.title and crit_re.search(site.title):
score += 4 * weight
if site.description and crit_re.search(site.description):
score += 4 * weight
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
return score >= MIN_RELEVANCE_SCORE
re_criteria = {
(
'anarch',
1.0,
('*',),
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
),
('libertär', 0.5, ('de'), re.compile('(libert(är|&auml;r))', re.I)),
}
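A worked example of the scoring: the negative lookbehind `(?<!p)` keeps words like 'panarchy' from matching. With the 'anarch' criterion's weight of 1.0, two matches in the start page text add 2.0, and a match in the title adds another 4.0, for 6.0 >= MIN_RELEVANCE_SCORE:

```python
import re

crit_re = re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I)
text = 'Anarchism and anarchist history, but not panarchy.'
print(crit_re.findall(text))
# -> [('Anarchism', 'ism'), ('anarchist', 'ist')]  (2 findings: score 2.0)
```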

@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
"""
Per-site path filter. Return whether the path shall be retrieved.
"""
if not robots.can_fetch_url(site.base_url + path):
return False
if 'amusewiki' in site.meta_info.get('generator', '').lower():
if any(
[
path.endswith(end)
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
]
):
return False
if '/bbselect?' in path:
return False
return True
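A usage sketch with hypothetical stubs (the real `site` and `robots` objects come from atextcrawler and carry more state): for a site generated by amusewiki, the alternative download formats are skipped while plain page paths pass:

```python
from types import SimpleNamespace

site = SimpleNamespace(
    base_url='https://example.org',
    meta_info={'generator': 'amusewiki 2.x'},
)
robots = SimpleNamespace(can_fetch_url=lambda url: True)  # permit everything

print(sp_filter(site, '/library/some-text.epub', robots))  # -> False
print(sp_filter(site, '/library/some-text', robots))       # -> True
```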