Put under version control

commit a6af5b12d2 (parent d26d23348b)
83 changed files with 20130 additions and 0 deletions
doc/source/conf.py | 71 additions | Normal file
@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html


# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys

proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')


# -- Project information -----------------------------------------------------

project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'

# The full version, including alpha/beta/rc tags
release = '0.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'myst_parser',
    'sphinx.ext.graphviz',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

autosummary_generate = True

source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}

doc/source/config_template/initial_data/seed_urls.list | 23 additions | Normal file
@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.

# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/

# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/

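Note: the whitelist/blacklist convention above ('+' and '-' prefixes, '#' comments) could be parsed with a small helper along these lines (a sketch only; `parse_seed_urls` is a hypothetical name, not part of atextcrawler's API):
```python
from typing import List, Tuple


def parse_seed_urls(text: str) -> Tuple[List[str], List[str]]:
    """Split a seed list into whitelisted and blacklisted URLs."""
    whitelist, blacklist = [], []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue  # skip blank lines and comments
        if line.startswith('+'):
            whitelist.append(line[1:])
        elif line.startswith('-'):
            blacklist.append(line[1:])
    return whitelist, blacklist
```
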
doc/source/config_template/main.yaml | 88 additions | Normal file
@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as a relative path, it will be relative to the
# directory of this file (main.yaml).
# Read the documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________

# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item.
  # Also the delay in seconds after a worker has found
  # no site to process.
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext

# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict

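Note: a configuration like the template above could be loaded, with the documented crawl defaults filled in, roughly as follows (a sketch assuming PyYAML is available; atextcrawler's real loader is `atextcrawler.config`, and `load_main_config` here is a hypothetical name):
```python
import os

import yaml  # PyYAML

# Default values as documented in main.yaml above.
CRAWL_DEFAULTS = {
    'workers': 10,
    'site_delay': 600,
    'site_revisit_interval': 3600,
    'resource_delay': 5,
    'full_crawl_interval': 864000,
    'feed_crawl_interval': 86400,
}


def load_main_config(path: str = '~/.config/atextcrawler/main.yaml') -> dict:
    """Load main.yaml and fill in the documented crawl defaults."""
    with open(os.path.expanduser(path)) as f:
        config = yaml.safe_load(f) or {}
    crawl = config.get('crawl') or {}
    for key, default in CRAWL_DEFAULTS.items():
        crawl.setdefault(key, default)
    config['crawl'] = crawl
    return config
```
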
doc/source/config_template/plugins/__init__.py | 0 additions | Normal file

doc/source/config_template/plugins/filter_resource_path.py | 22 additions | Normal file
@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.

This plugin implements :func:`rp_filter`.
"""

from typing import Optional


def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter found paths (may depend on site).

    To filter out a path (i.e., not add it to table `site_path`)
    return None.
    """
    path = durl.pwa()
    # skip fetching images (linked from <a> tags; <img> tags are skipped anyway)
    if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
        return None
    path = path.removesuffix('?amp=1')
    return path

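Note: a quick illustration of the `rp_filter` contract, using a purely hypothetical stand-in for the `durl` argument that atextcrawler passes in (assumes `rp_filter` from the plugin above has been imported):
```python
class FakeDurl:
    """Hypothetical stand-in exposing only the pwa() method used above."""

    def __init__(self, path: str):
        self._path = path

    def pwa(self) -> str:
        return self._path


assert rp_filter(None, FakeDurl('/photos/cat.JPG')) is None  # images are skipped
assert rp_filter(None, FakeDurl('/texts/intro?amp=1')) == '/texts/intro'
```
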
doc/source/config_template/plugins/filter_site.py | 47 additions | Normal file
@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.

This plugin implements :func:`site_filter`.
"""

import re

from atextcrawler.models import Site

MIN_RELEVANCE_SCORE = 5


async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    If the site shall be crawled, return True, else False.
    """
    # limit to sites in English or German language
    if not set(['de', 'en']) & set(site.langs):
        return False
    score = 0.0
    for crit_name, weight, langs, crit_re in re_criteria:
        if '*' in langs or set(langs) & set(site.langs):
            findings = crit_re.findall(site.startpage_text)
            if findings:
                score += weight * len(findings)
            if site.title and crit_re.search(site.title):
                score += 4 * weight
            if site.description and crit_re.search(site.description):
                score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE


re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    ('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
}

doc/source/config_template/plugins/filter_site_path.py | 24 additions | Normal file
@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.

This plugin implements :func:`sp_filter`.
"""


def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    if 'amusewiki' in site.meta_info.get('generator', '').lower():
        if any(
            [
                path.endswith(end)
                for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
            ]
        ):
            return False
    if '/bbselect?' in path:
        return False
    return True

doc/source/devel/devel.md | 63 additions | Normal file
@@ -0,0 +1,63 @@
## Setup dev environment
1. You need Python 3.9 or later.
1. Have pipenv installed, e.g.: install pip3 (`apt install python3-pip`), then run `pip3 install --user pipenv`.
1. Clone the repo and set up a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```

## Configure the instance
See [installation](installation.md).

## Run
```
python -m atextcrawler
```

## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```

## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```

## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```

## Release
There are no releases (currently).

## Useful commands

### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```

### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;

http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*

http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices

-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
```

doc/source/devel/related_work.md | 64 additions | Normal file
@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)

### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
  * [repo](https://github.com/adbar/trafilatura)
  * [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)

#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)

### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)

### url handling
* [courlan](https://pypi.org/project/courlan/)

### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)

### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) ([demo](https://nlp.fi.muni.cz/projects/justext/))

### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
  * [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously (see the sketch below)

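A minimal sketch of the word-7-tuple idea mentioned in the last bullet (plain in-memory shingles, not the smlar-based variant; the helper name is hypothetical):
```python
from typing import Set, Tuple

seen_shingles: Set[Tuple[str, ...]] = set()


def is_mostly_duplicate(paragraph: str, threshold: float = 0.5) -> bool:
    """Return True if more than `threshold` of the paragraph's word
    7-tuples were already seen in previously processed paragraphs."""
    words = paragraph.split()
    shingles = [tuple(words[i:i + 7]) for i in range(len(words) - 6)]
    if not shingles:
        return False
    known = sum(1 for s in shingles if s in seen_shingles)
    seen_shingles.update(shingles)
    return known / len(shingles) > threshold
```
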
### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
* https://support.shareaholic.com/hc/en-us/articles/115003085186

### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language

ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview

doc/source/devel/todo.md | 77 additions | Normal file
@@ -0,0 +1,77 @@
## TODO

* parse html time tags

* site annotations:
  * categories
  * historical (no changes any more since n months)
  * news
  * local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip

* allow for tls in elasticsearch config

* replace dashes, dots and quotes (a sketch of applying such a table follows this block): https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
    '–': '--',
    '—': '---',
    '…': '...',
    '“': '"',
    '”': '"',
    '„': '"',
    '″': '"',
    '‘': "'",
    '’': "'",
    '′': "'",
```

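A minimal sketch of applying such a replacement table (assuming the single-character keys shown above; `unsmarten` is a hypothetical name, unrelated to calibre's implementation):
```python
REPLACEMENTS = {
    '–': '--', '—': '---', '…': '...',
    '“': '"', '”': '"', '„': '"', '″': '"',
    '‘': "'", '’': "'", '′': "'",
}
# str.maketrans accepts single-character keys mapped to replacement strings
_TABLE = str.maketrans(REPLACEMENTS)


def unsmarten(text: str) -> str:
    """Replace typographic dashes, ellipses and quotes with ASCII."""
    return text.translate(_TABLE)


print(unsmarten('„Hallo“ – sagte er…'))  # -> "Hallo" -- sagte er...
```
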
* normalize quotation marks and punctuation in general
  * https://unicode-table.com/en/sets/quotation-marks/
  * https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
  * https://www.fileformat.info/info/unicode/category/Po/list.htm
  * https://www.gaijin.at/en/infos/unicode-character-table-punctuation
  * ⁝

* cancel crawls that take too long

* search for "TODO" in code

* feedparser has support for JSON feeds since commit
  a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
  (as of 2020-10-26 in the "develop" branch, not part of a release);
  the version names are 'json1' and 'json11'

* allow site URLs with path, e.g.
  https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/

* add more languages

## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives

* [spacy-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)

* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)

doc/source/development.rst | 9 additions | Normal file
@@ -0,0 +1,9 @@
Development
-----------

.. toctree::
   :maxdepth: 2

   devel/devel
   devel/todo
   devel/related_work

doc/source/elasticsearch.md | 119 additions | Normal file
@@ -0,0 +1,119 @@
# Howto elasticsearch

## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```

If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```

## Setup

### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).

We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.

```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```

First test:
```
http -j GET 127.0.0.1:9200/
```

### Storage

```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```

Edit /etc/elasticsearch/elasticsearch.yml
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```

```
systemctl restart elasticsearch
```

The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```

### Setup passwords
Set up passwords:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```

Copy the output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```

Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```

### Memory limitation
To limit memory usage, create a systemd override and restart the service:
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF

systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```

## Usage
Some useful requests:

### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```

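Note: the `id` and `api_key` returned by the request above are what the `elasticsearch` section of `main.yaml` expects. Elasticsearch API-key authentication sends `ApiKey base64(id:api_key)` in the `Authorization` header; a sketch (the helper name and the use of `requests` are arbitrary choices, not atextcrawler code):
```python
import base64

import requests


def es_auth_header(api_id: str, api_key: str) -> dict:
    """Build the Authorization header for an Elasticsearch API key."""
    token = base64.b64encode(f'{api_id}:{api_key}'.encode()).decode()
    return {'Authorization': f'ApiKey {token}'}


# usage (values come from the output of the _security/api_key request)
resp = requests.get(
    'http://127.0.0.1:9200/_cat/indices',
    headers=es_auth_header('<id>', '<api_key>'),
)
print(resp.text)
```
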
doc/source/index.rst | 37 additions | Normal file
@@ -0,0 +1,37 @@
atextcrawler
============

atextcrawler is an asynchronous webcrawler indexing text
for literal and semantic search.

Its client-side counterpart is atextsearch_.

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   introduction
   installation
   maintenance
   development
   reference/modules


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

doc/source/installation.md | 122 additions | Normal file
@@ -0,0 +1,122 @@
# Installation
Installation has only been tested on Debian bullseye (amd64).
The instructions below are for this system.
(Please adapt them to other environments.)

## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for the Python package gcld3 (see below).

## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```

## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).

Note: TLS is not yet supported, so install this service locally.

See [elasticsearch howto](elasticsearch.md).

## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.

Note: TLS is not yet supported, so install this service locally.

See [tensorflow howto](tensorflow_model_server.md).

## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:\$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages  # for systemd
pre-commit install
```

Note: One of the dependencies, the Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```

## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```

Edit `$HOME/.config/atextcrawler/main.yaml`.

If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/atextcrawler/plugins
```

Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.

Check (and print) the instance configuration:
```
python -m atextcrawler.config
```

## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.

## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target

[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```

doc/source/introduction.md | 66 additions | Normal file
@@ -0,0 +1,66 @@
# Introduction

## What atextcrawler does
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
  of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
  optimized for html5); discard non-text content, but handle feeds
  and sitemaps
* Extract internal and external links; external links contribute
  to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch,
  using tensorflow model server with a multilingual language model

## Architecture
There is only one python process running concurrently.
We use asyncio where possible (almost everywhere).

1. There is a queue of websites, see database table `site_queue`.
   The queue is fed a) on first startup with seeds, b) manually
   and c) from crawls which find external links.
   When the queue is handled, new sites are stored to table `site`.
   New sites are always updated; existing sites only if their last
   update was more than `crawl.site_revisit_interval` seconds in the past.
   After the queue has been handled there is a delay
   (`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
   criteria are applied to its content to determine whether
   the site is relevant. (It is assumed that (non-)relevance is
   obvious from the start page already.) If the site is relevant,
   more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
   `crawl.workers`) which concurrently crawl sites, one at a time
   per worker. (During the crawl the site is marked as locked using
   crawl_active=true.) They pick a relevant site which has not been
   crawled for a certain time ("checkout"), crawl it, and finally
   mark it as crawled (crawl_active=false, "checkin") and schedule
   the next crawl; see the sketch after this list.
   Each crawl (with begin time, end time, number of found (new)
   resources) is stored in table `crawl`.
1. Crawls are either full crawls (all paths reachable through links
   from the start page are fetched) or feed crawls (only paths listed
   in a feed of the site are fetched). The respective (minimum)
   intervals in which these crawls happen are `full_crawl_interval`
   and `feed_crawl_interval`.
   Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
   sitemap) or a TextResource (redirects are followed and irrelevant
   content is ignored). A TextResource obtained from a path can be
   very similar to a resource obtained from another path; in this case
   no new resource is created, but both paths are linked to the same
   resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
   added to table `site_path`. If it is a feed, the feed is stored in
   table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.
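The worker loop from step 3 could look roughly like the following sketch (hypothetical helper names and in-memory stubs; the real implementation lives in the atextcrawler sources and talks to PostgreSQL):
```python
import asyncio
import random


# Hypothetical stand-ins for the real database helpers.
async def checkout_site():
    """Pick a relevant site due for crawling and set crawl_active=true."""
    return random.choice([None, 'https://example.org/'])


async def crawl_site(site):
    """Run a full or feed crawl of one site."""
    await asyncio.sleep(0.1)


async def checkin_site(site):
    """Set crawl_active=false and schedule the next crawl."""


async def crawler_worker(site_delay: float = 1.0):
    """One worker: repeatedly check out a site, crawl it, check it back in."""
    while True:
        site = await checkout_site()
        if site is None:
            await asyncio.sleep(site_delay)  # no site due; wait and retry
            continue
        try:
            await crawl_site(site)
        finally:
            await checkin_site(site)


# crawl.workers such workers run concurrently in one process, e.g.:
# await asyncio.gather(*(crawler_worker() for _ in range(3)))
```
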
## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are considered:
blacklisting or whitelisting has precedence over function `site_filter`
(in plugin `filter_site`).

Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).

Each annotation requires a base_url of the annotated site and,
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).

## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
  background task on a server with limited resources
  (or even an SBC, like a Raspberry Pi, with attached storage)
* atextcrawler only indexes text, no other resources like images

doc/source/maintenance.md | 23 additions | Normal file
@@ -0,0 +1,23 @@
# Maintenance

## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```

## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```

doc/source/tensorflow_model_server.md | 98 additions | Normal file
@@ -0,0 +1,98 @@
# Tensorflow model server

## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```

## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget -O universal-sentence-encoder-multilingual_3.tar.gz "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed"
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```

Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```

Config file `/srv/tensorflow/config`:
```
model_config_list: {
  config: {
    name: "sentences",
    base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
    model_platform: "tensorflow"
    model_version_policy: {latest{}},
  },
  config: {
    ... (next model)
  },
}
```

## Systemd integration
Edit /etc/systemd/system/tensorflow.service
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service

[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```

Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```

## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```

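Note: the `model_server_endpoint` in `main.yaml` points at this model's `:predict` URL. A sketch of a request against the TensorFlow Serving REST API, assuming the model's default serving signature accepts a batch of raw strings:
```python
import json
import urllib.request

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'


def embed(sentences):
    """Request embeddings for a batch of sentences from the model server."""
    payload = json.dumps({'instances': sentences}).encode()
    req = urllib.request.Request(
        ENDPOINT, data=payload, headers={'Content-Type': 'application/json'}
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())['predictions']


# each prediction is an embedding vector for the corresponding sentence
vectors = embed(['What is anarchism?', 'Was ist Anarchismus?'])
print(len(vectors), len(vectors[0]))
```
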
## Docs

* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server

Datasets:
* https://www.tensorflow.org/datasets/catalog/overview