Put under version control

This commit is contained in:
ibu 2021-11-29 09:16:31 +00:00
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

51
.gitignore vendored Normal file

@@ -0,0 +1,51 @@
# Backup files
*.~
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
NOTES
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
htmlcov
# Translations
*.mo
# mypy cache
.mypy_cache
# Sphinx documentation
doc/build/
doc/source/reference/
# tmp dir
tmp/

30
.pre-commit-config.yaml Normal file

@@ -0,0 +1,30 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 21.11b1
hooks:
- id: black
- repo: https://github.com/timothycrosley/isort
rev: 5.10.1
hooks:
- id: isort
args: ["--profile", "black", "--filter-files", "-l", "79"]
- repo: https://github.com/myint/autoflake
rev: v1.4
hooks:
- id: autoflake
args:
[
"--in-place",
"--remove-all-unused-imports",
"--ignore-init-module-imports",
"--remove-unused-variables",
]

46
Pipfile Normal file

@@ -0,0 +1,46 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
aiohttp = "*"
async-lru = "*"
asyncpg = "*"
beautifulsoup4 = "*"
elasticsearch = { version = ">=7.0.0,<8.0.0", extras = ['async'] }
elasticsearch-dsl = { version = ">=7.0.0,<8.0.0" }
feedparser = "*"
gcld3 = "*"
# TODO: recheck
pypandoc = "*"
pytidylib = "*"
pytz = "*"
pyyaml = "*"
tika = "*"
tldextract = "*"
voluptuous = "*"
simhash = "*"
async-dns = "*"
types-pyyaml = "*"
sphinx-rtd-theme = "*"
[dev-packages]
mypy = "*"
pre-commit = "*"
sphinx = "*"
myst-parser = "*"
isort = "*"
blacken-docs = "*"
pybetter = "*"
interrogate = "*"
autoflake = "*"
types-pyyaml = "*"
types-pytz = "*"
black = "*"
[requires]
python_version = "3.9"
[pipenv]
allow_prereleases = true

1561
Pipfile.lock generated Normal file

File diff suppressed because it is too large

13
README.md Normal file

@@ -0,0 +1,13 @@
atextcrawler is an asynchronous webcrawler indexing text for literal and semantic search.
Its client-side counterpart is [atextsearch](https://gitea.multiname.org/a-text/atextsearch)
atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.
atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

20
doc/Makefile Normal file

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

71
doc/source/conf.py Normal file

@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys
proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')
# -- Project information -----------------------------------------------------
project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'
# The full version, including alpha/beta/rc tags
release = '0.1.0'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'myst_parser',
'sphinx.ext.graphviz',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
autosummary_generate = True
source_suffix = {
'.rst': 'restructuredtext',
'.md': 'markdown',
}


@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.
# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/
# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/


@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler
# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod
# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info
# Plugins directory
# If given as relative path, it will be relative to the
# directory of this file (main.yaml).
# Read documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins
# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
host: localhost
port: 5432
database: atextcrawler
user: atextcrawler
password: ________________________
# Crawling
crawl:
# Number of concurrent workers
# Default value: 10
# Allowed values: integer >=0 and <=1000
#workers: 3
# Delay in seconds between attempts to fetch items
# from site_queue if the last attempt gave no item
# Also the delay in seconds after a worker has found
# no site to process
# Default value: 600
# Allowed values: positive number
#site_delay: 10
# Time interval in seconds between site updates when
# handling queued base URLs
# Default value: 3600
# Allowed values: positive number
#site_revisit_interval: 3600
# Delay in seconds between attempts to process
# individual resources (pages etc.) of a site
# Default value: 5
# Allowed values: positive number
#resource_delay: 3
# Default interval in seconds between full crawls of a site
# Default value: 864000 (10 days)
# Allowed values: positive number
#full_crawl_interval: 864000
# Default interval in seconds between feed crawls of a site
# Default value: 86400 (1 day)
# Allowed values: positive number
#feed_crawl_interval: 86400
# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
# host on which ES is running
host: localhost
# API key for accessing ES
api_key: "**********************"
# API user id
id: "**********************"
# Index base name (actual index names will have '_text' etc. appended)
index_base_name: atext
# Tensorflow access
tensorflow:
# The prediction endpoint of the model server's sentence model
model_server_endpoint: http://localhost:9000/v1/models/sentences:predict


@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.
This plugin implements :func:`rp_filter`.
"""
from typing import Optional
def rp_filter(site, durl) -> Optional[str]:
"""
Adjust or filter found paths (may depend on site).
To filter out a path (i.e., not add it to table `site_path`)
return None.
"""
path = durl.pwa()
# skip fetching images linked from <a> tags (<img> tags are skipped anyway)
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
return None
path = path.removesuffix('?amp=1')
return path


@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.
This plugin implements :func:`site_filter`.
"""
import re
from atextcrawler.models import Site
MIN_RELEVANCE_SCORE = 5
async def site_filter(site: Site) -> bool:
"""
Assess relevance of the site (using language-dependent criteria).
If the site shall be crawled, return True, else False.
"""
# limit to sites in English or German language
if not set(['de', 'en']) & set(site.langs):
return False
score = 0.0
for crit_name, weight, langs, crit_re in re_criteria:
if '*' in langs or set(langs) & set(site.langs):
findings = crit_re.findall(site.startpage_text)
if findings:
score += weight * len(findings)
if site.title and crit_re.search(site.title):
score += 4 * weight
if site.description and crit_re.search(site.description):
score += 4 * weight
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
return score >= MIN_RELEVANCE_SCORE
re_criteria = {
(
'anarch',
1.0,
('*',),
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
),
('libertär', 0.5, ('de',), re.compile('(libert(är|&auml;r))', re.I)),
}


@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
"""
Per-site path filter. Return whether the path shall be retrieved.
"""
if not robots.can_fetch_url(site.base_url + path):
return False
if 'amusewiki' in site.meta_info.get('generator', '').lower():
if any(
[
path.endswith(end)
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
]
):
return False
if '/bbselect?' in path:
return False
return True

63
doc/source/devel/devel.md Normal file

@@ -0,0 +1,63 @@
## Setup dev environment
1. You need python 3.9 or later.
1. Install pipenv, e.g. by first installing pip3 (`apt install python3-pip`) and then running `pip3 install --user pipenv`
1. Clone the repo and setup a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```
## Configure the instance
See [installation](installation.md).
## Run
```
python -m atextcrawler
```
## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```
## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```
## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```
## Release
There are no releases (currently).
## Useful commands
### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```
### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;
http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*
http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices
-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
```


@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)
### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
* [repo](https://github.com/adbar/trafilatura)
* [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)
#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)
### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)
### url handling
* [courlan](https://pypi.org/project/courlan/)
### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)
### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/)
### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously
### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
* https://support.shareaholic.com/hc/en-us/articles/115003085186
### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language
#### ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview

77
doc/source/devel/todo.md Normal file

@@ -0,0 +1,77 @@
## TODO
* parse html time tags
* site annotations:
* categories
* historical (no changes any more since n months)
* news
* local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip
* allow for tls in elasticsearch config
* replace dashes, dots and quotes: https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
'&#8211;': '--',
'&ndash;': '--',
'–': '--',
'&#8212;': '---',
'&mdash;': '---',
'—': '---',
'&#8230;': '...',
'&hellip;': '...',
'…': '...',
'&#8220;': '"',
'&#8221;': '"',
'&#8222;': '"',
'&#8243;': '"',
'&ldquo;': '"',
'&rdquo;': '"',
'&bdquo;': '"',
'&Prime;': '"',
'“':'"',
'”':'"',
'„':'"',
'″':'"',
'&#8216;':"'",
'&#8217;':"'",
'&#8242;':"'",
'&lsquo;':"'",
'&rsquo;':"'",
'&prime;':"'",
'':"'",
'':"'",
'':"'",
```
* normalize quotation marks and punctuation in general
* https://unicode-table.com/en/sets/quotation-marks/
* https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
* https://www.fileformat.info/info/unicode/category/Po/list.htm
* https://www.gaijin.at/en/infos/unicode-character-table-punctuation
* ⁝
* cancel crawls that take too long
* search for "TODO" in code
* feedparser has support for JSON feeds since commit
a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
(as of 2020-10-26 in "develop" branch, not part of a release)
the version names are 'json1' and 'json11'
* allow site URLs with path, e.g.
https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/
* add more languages
## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives
* [space-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)
* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)


@@ -0,0 +1,9 @@
Development
-----------
.. toctree::
:maxdepth: 2
devel/devel
devel/todo
devel/related_work

119
doc/source/elasticsearch.md Normal file

@@ -0,0 +1,119 @@
# Howto elasticsearch
## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```
If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```
## Setup
### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).
We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.
```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```
First test:
```
http -j GET 127.0.0.1:9200/
```
### Storage
```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```
Edit /etc/elasticsearch/elasticsearch.yml
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```
```
systemctl restart elasticsearch
```
The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```
### Setup passwords
Setup passwords:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```
Copy output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```
Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```
### Memory limitation
To limit memory usage, create a systemd override:
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF
```
and restart the service:
```
systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```
## Usage
Some useful requests:
### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```
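The returned `id` and `api_key` go into the `elasticsearch` section of `main.yaml`. To verify the key from Python, a quick sketch with the async client pinned in the Pipfile could look like this (the placeholder credentials must be replaced; the real application wires this up differently):
```
import asyncio

from elasticsearch import AsyncElasticsearch

async def check_api_key():
    # Use the id/api_key pair returned by the _security/api_key request above.
    es = AsyncElasticsearch(
        hosts=['http://127.0.0.1:9200'],
        api_key=('REPLACE_WITH_ID', 'REPLACE_WITH_API_KEY'),
    )
    try:
        print(await es.info())  # cluster name, version, ...
    finally:
        await es.close()

asyncio.run(check_api_key())
```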

37
doc/source/index.rst Normal file

@@ -0,0 +1,37 @@
atextcrawler
============
atextcrawler is an asynchronous webcrawler indexing text
for literal and semantic search.
Its client-side counterpart is atextsearch_.
atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.
atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.
.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch
.. toctree::
:maxdepth: 2
:caption: Contents:
introduction
installation
maintenance
development
reference/modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

122
doc/source/installation.md Normal file

@@ -0,0 +1,122 @@
# Installation
Installation was only tested on Debian bullseye (on amd64).
The instructions below are for this system.
(Please adapt to other environments.)
## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for python package gcld3 (see below).
## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```
## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).
Note: TLS is not yet supported, so install this service locally.
See [elasticsearch howto](elasticsearch.md).
## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.
Note: TLS is not yet supported, so install this service locally.
See [tensorflow howto](tensorflow_model_server.md).
## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages # for systemd
pre-commit install
```
Note: One of the dependencies, Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```
## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```
Edit `$HOME/.config/atextcrawler/main.yaml`.
If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/atextcrawler/plugins
```
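An overriding plugin only needs to provide the same function as the default plugin it replaces. Purely as an illustration (not part of the shipped defaults), a minimal `filter_site.py` override could look like this:
```
# $HOME/.config/atextcrawler/plugins/filter_site.py  (illustrative example)
from atextcrawler.models import Site

async def site_filter(site: Site) -> bool:
    """
    Only crawl sites that declare English among their languages.
    """
    return 'en' in site.langs
```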
Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.
Check (and print) the instance configuration:
```
python -m atextcrawler.config
```
## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.
## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target
[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure
[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```


@@ -0,0 +1,66 @@
# Introduction
## What atextcrawler does:
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
optimized for html5); discard non-text content, but handle feeds
and sitemaps
* Extract internal and external links; external links contribute
to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch
using tensorflow model server with a multilingual language model
## Architecture
There is only one python process running concurrently.
We use asyncio where possible (almost everywhere).
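As a purely illustrative sketch (the names and the in-memory queue are simplified assumptions; the real queue is the database table `site_queue`, as described below), the single-process, multi-worker asyncio pattern looks roughly like this:
```
import asyncio

async def crawl_worker(name, queue):
    # Each worker loops: take a site from the queue and crawl it.
    while True:
        site = await queue.get()
        if site is None:  # shutdown marker
            break
        print(name, 'crawling', site)
        await asyncio.sleep(0.1)  # placeholder for the real crawl

async def main():
    queue = asyncio.Queue()
    for url in ('https://example.org/', 'https://example.net/'):
        queue.put_nowait(url)
    workers = [crawl_worker(f'worker-{i}', queue) for i in range(3)]
    for _ in workers:
        queue.put_nowait(None)  # one shutdown marker per worker
    await asyncio.gather(*workers)

asyncio.run(main())
```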
1. There is a queue of websites, see database table `site_queue`.
The queue is fed a) on first startup with seeds, b) manually
and c) from crawls which find external links.
When the queue is handled, new sites are stored in table `site`.
New sites are always updated; existing sites only if their last update was more than `crawl.site_revisit_interval` seconds in the past.
After the queue has been handled there is a delay
(`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
criteria are applied to its content to determine whether
the site is relevant. (It is assumed that (non-)relevance is
obvious from the start page already.) If the site is relevant,
more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
`crawl.workers`) which concurrently crawl sites, one at a time
per worker. (During a crawl the site is marked as locked using
crawl_active=true.) A worker picks a relevant site which has not been crawled for a certain time ("checkout"), crawls it, and finally marks it as crawled (crawl_active=false, "checkin") and schedules the next crawl; a sketch of this checkout/checkin follows the list below.
Each crawl (with begin time, end time and the number of new
resources found) is stored in table `crawl`.
1. Crawls are either full crawls (in which all paths reachable
through links from the start page are fetched) or feed crawls (in which only paths listed in a feed of the site are fetched). The respective (minimum) intervals at which these crawls happen are `full_crawl_interval` and `feed_crawl_interval`.
Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
sitemap) or a TextResource (redirects are followed and irrelevant content is ignored). A TextResource obtained from a path can be very similar to a resource obtained from another path; in this case no new resource is created, but both paths are linked to the same resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
added to table `site_path`. If it is a feed, the feed is stored in table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.
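The checkout/checkin mentioned above can be pictured with a hypothetical asyncpg sketch; the column names `crawl_active`, `relevant` and `base_url` appear elsewhere in this commit, while the selection criteria and helper names are assumptions for illustration only:
```
import asyncpg

async def checkout_site(pool: asyncpg.Pool):
    # Atomically lock one crawlable site (hypothetical criteria).
    sql = """
        UPDATE site SET crawl_active = true
        WHERE id = (
            SELECT id FROM site
            WHERE relevant AND NOT crawl_active
            ORDER BY id
            LIMIT 1
            FOR UPDATE SKIP LOCKED
        )
        RETURNING id, base_url
    """
    async with pool.acquire() as conn:
        return await conn.fetchrow(sql)

async def checkin_site(pool: asyncpg.Pool, site_id: int):
    # Unlock the site after the crawl.
    async with pool.acquire() as conn:
        await conn.execute(
            'UPDATE site SET crawl_active = false WHERE id = $1', site_id
        )
```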
## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are considered:
Blacklisting or whitelisting has precedence over function `site_filter`
(in plugin `filter_site`).
Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).
Each annotation requires a base_url of the annotated site and
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).
## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
background task on a server with limited resources
(or even an SBC, like raspberry pi, with attached storage)
* atextcrawler only indexes text, no other resources like images

23
doc/source/maintenance.md Normal file

@@ -0,0 +1,23 @@
# Maintenance
## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```
## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```


@@ -0,0 +1,98 @@
# Tensorflow model server
## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```
## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget -O universal-sentence-encoder-multilingual_3.tar.gz 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed'
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```
Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```
Config file `/srv/tensorflow/config`:
```
model_config_list: {
config: {
name: "sentences",
base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
model_platform: "tensorflow"
model_version_policy: {latest{}},
},
config: {
... (next model)
},
}
```
## Systemd integration
Edit /etc/systemd/system/tensorflow.service
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service
[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s
[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```
Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```
## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```
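The crawler itself talks to the prediction endpoint configured in `main.yaml` (`.../v1/models/sentences:predict`). A minimal sketch of such a request with aiohttp (the exact request handling in atextcrawler may differ):
```
import asyncio

import aiohttp

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'

async def embed(texts):
    # TF Serving REST API: send instances, receive one embedding per text.
    async with aiohttp.ClientSession() as session:
        async with session.post(ENDPOINT, json={'instances': texts}) as resp:
            data = await resp.json()
    return data['predictions']

vectors = asyncio.run(embed(['Hello world', 'Hallo Welt']))
print(len(vectors), len(vectors[0]))
```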
## Docs
* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server
Datasets:
* https://www.tensorflow.org/datasets/catalog/overview

48
license.txt Normal file

@@ -0,0 +1,48 @@
ANTI-AUTHORITARIAN LICENSE version 1.0
________________________________________________________________________________
Obviously, this license is relevant to all who are bound by law.
The licensee ("you") must not be a commercial, military, clerical or
governmental entity. For this license the term "software" means the program
code, documentation as well as other data (for instance, language files).
Subject to the respective terms and conditions described below the licensee
is granted the non-exclusive and non-transferable license to:
A. make copies of the software
B. create derivative works ("modifications")
C. install and run copies or modifications of the software on any number of
servers, thereby making them usable for the licensee and possibly others
D. offer or give copies or modifications of the software, or parts of the
unmodified or modified software to others
For these permissions the respective conditions stated below must be met:
* For permission A condition 1 must be met.
* For permission B all of the conditions 1, 3, 4 must be met.
* For permission C all of the conditions 2, 3 must be met.
* For permission D all of the conditions 1, 2, 3, 4, 5 must be met.
These are the conditions:
1. You include this copyright notice and license in any copy or modification.
In files that contain a reference to it you preserve this reference.
2. You do not use this software or any modification of it for any commercial
purpose or for monetary gain, and also not for any military, governmental
or religious purpose; here with commercial purpose we mean activities which
have among their goals to make profit, be it monetary profit or any other
kind of profit that may entail or contribute to monetary profit.
3. Demos or screenshots of the modified or unmodified software must not be
published in any medium which requires the viewers to pay money in order
to see the contents; here money paid for mere internet connectivity (i.e.,
independent of the content supplier) is to be disregarded.
4. You do not impose any further restrictions on this software or any
derivative works beyond those restrictions herein.
5. The copy or modification must include source code, and must allow
distribution in source code as well as compiled form. The source code
must be the preferred form in which a programmer would modify the program.
Deliberately obfuscated source code is not allowed. Intermediate forms
such as the output of a preprocessor or translator are not allowed.
For this license itself, if re-used for other software, the following
copyright and license applies (copyheart license):
♡ Copying is an act of love. Please copy.

10
pyproject.toml Normal file

@@ -0,0 +1,10 @@
# TOML formatted file; see PEP 518
[tool.isort]
profile = "black"
#multi_line_output = 3
[tool.black]
line-length = 79
target_version = ['py39']
skip-string-normalization = true


@@ -0,0 +1,12 @@
"""
atextcrawler application execution entry point.
"""
import asyncio
from .application import Application
from .config import Config
if __name__ == '__main__':
config = Config().get()
asyncio.run(Application(config).run())


@@ -0,0 +1,204 @@
"""
atextcrawler application.
"""
import asyncio
import importlib
import logging
import signal
import sys
from systemd.journal import JournalHandler
from .config import Config
from .crawl import CrawlWorker
from .db import PGPool
from .search import shutdown_engine, startup_engine
from .site import load_seeds, process_site_queue
plugin_names = ['filter_site', 'filter_site_path', 'filter_resource_path']
class Application:
"""
atextcrawler application.
The basic structure of the application is this:
* one site crawler works just on the site_queue: fetching start pages
of sites and storing updated site information in table sites
* N other CrawlWorkers each do this in a loop:
checkout a site that is due for crawl and crawl its resources;
they fill the site_queue
"""
running = True
def __init__(self, config=None):
if config is None:
config = Config().get()
self.config = config
self.instance_name = config['instance_name']
self.instance_type = config['instance_type']
log_level = getattr(
logging, config['log_level'].upper(), logging.CRITICAL
)
self.logger = logging.getLogger('atextcrawler')
self.logger.setLevel(log_level)
if self.instance_type == 'dev':
self.logger.addHandler(logging.StreamHandler())
else:
self.logger.addHandler(
JournalHandler(SYSLOG_IDENTIFIER=self.instance_name)
)
self.logger.propagate = False
self.channel = 'atextcrawler_' + self.config['instance_name']
msg = f'Instance "{self}" initializing'
self.logger.info(msg)
self.plugins = self._load_plugins()
def __str__(self):
return self.instance_name
def _load_plugins(self):
"""
Return a dict mapping plugin names to modules.
"""
modules = {}
old_path = sys.path.copy()  # copy, so sys.path can be restored below
for name in plugin_names:
try:
plugins_dir = self.config['plugins_dir']
sys.path.insert(0, plugins_dir)
module = importlib.import_module(name)
msg = f'Loading plugin "{name}" from {plugins_dir}'
except:
module = importlib.import_module(
'atextcrawler.plugin_defaults.' + name
)
msg = f'Loading plugin "{name}" from default location'
self.logger.info(msg)
modules[name] = module
sys.path = old_path
return modules
async def run(self):
"""
Application lifecycle.
"""
await asyncio.gather(self.wait_for_shutdown(), self.startup())
await self.shutdown()
async def startup(self):
"""
Asynchronous startup.
"""
msg = f'Instance "{self}" starting components'
self.logger.info(msg)
self.search_engine = await startup_engine(self.config)
self.pgpool = await PGPool(self.config['postgresql'])
self.pool = self.pgpool.pool
await load_seeds(self.config, self.pool)
await reset_site_locks(self.pool)
worker_count = self.config['crawl']['workers']
self.workers = []
for worker_number in range(worker_count):
worker = await CrawlWorker(self, worker_number, self.pool)
self.workers.append(worker)
worker_coros = [worker.run() for worker in self.workers]
await asyncio.gather(
process_site_queue(self, self.pool),
self.handle_notifications(),
*worker_coros,
)
async def wait_for_shutdown(self):
"""
Create a shutdown event (:class:`asyncio.Event`) and wait for it.
The event will be set by a signal handler for SIGINT
and SIGTERM signals (see :meth:`Application.handle_shutdown_signal`).
"""
self.shutdown_event = asyncio.Event()
for sig in (signal.SIGINT, signal.SIGTERM):
asyncio.get_running_loop().add_signal_handler(
sig, self.handle_shutdown_signal
)
self.logger.debug(f'{self} waiting for shutdown event')
await self.shutdown_event.wait()
self.logger.info(f'Instance "{self}" shutdown event')
def handle_shutdown_signal(self):
"""
Handle shutdown signal.
"""
if self.shutdown_event.is_set():
return
self.shutdown_event.set()
self.running = False
async def shutdown(self):
"""
Asynchronous shutdown.
"""
self.logger.debug(f'Instance "{self}" shutting down')
await self.notify_conn.remove_listener(
self.channel, self.listen_callback
)
await self.pool.release(self.notify_conn)
for worker in self.workers:
await worker.shutdown()
await shutdown_engine(self.search_engine)
await self.pgpool.shutdown()
self.logger.info(f'Instance "{self}" shutdown completed')
async def handle_notifications(self):
"""
Handle notifications using PostgreSQL's NOTIFY/LISTEN.
"""
self.notify_conn = await self.pool.acquire()
await self.notify_conn.add_listener(self.channel, self.listen_callback)
def listen_callback(self, *args):
"""
Handle notify event from PostgreSQL.
"""
channel = args[2]
if channel != self.channel:
return
message = args[3]
if message.startswith('site_update '):
try:
site_id = int(message.removeprefix('site_update '))
for worker in self.workers:
if worker.site and site_id == worker.site.id_:
msg = (
f'Cancelling worker {worker.worker_number}'
f' (site={site_id}) due to site_update'
)
self.logger.info(msg)
worker.running = False
except:
pass
async def sleep(self, duration, t_slice=3):
"""
Sleep for *duration* seconds while self.running.
Check self.running every *t_slice* seconds.
"""
remaining = duration
while remaining > 0 and self.running:
await asyncio.sleep(min(t_slice, remaining))
remaining -= t_slice
async def reset_site_locks(pool):
"""
Remove locks leftover from last run: Set crawl_active=false for all sites.
This is relevant when the application was not shutdown properly (e.g.
when the process was killed).
"""
async with pool.acquire() as conn:
sql = "UPDATE site SET crawl_active = false WHERE crawl_active = true"
await conn.execute(sql)


@@ -0,0 +1,7 @@
The recommended language tags to use in webpages are from
the IANA Language Subtag Registry (BCP47), see:
https://www.w3.org/International/questions/qa-html-language-declarations
https://r12a.github.io/app-subtags/
wget -O- https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | rg '^Subtag: |^Tag: ' atextcrawler/assets/iana_langs_ | sed -e 's/^Subtag: //' | sed -e 's/^Tag: //'


@@ -0,0 +1,219 @@
aa
ab
ae
af
ak
am
an
ar
as
av
ay
az
ba
be
bg
bh
bi
bm
bn
bo
br
bs
ca
ca
ce
ch
co
cr
cs
cu
cu
cu
cu
cu
cv
cy
da
de
dv
dv
dv
dz
ee
el
en
eo
es
es
et
eu
fa
ff
fi
fj
fo
fr
fy
ga
gd
gd
gl
gn
gu
gv
ha
he
hi
ho
hr
ht
ht
hu
hy
hz
ia
id
ie
ie
ig
ii
ii
ik
io
is
it
iu
ja
jv
ka
kg
ki
ki
kj
kj
kk
kl
kl
km
kn
ko
kr
ks
ku
kv
kw
ky
ky
la
lb
lb
lg
li
li
li
ln
lo
lt
lu
lv
mg
mh
mi
mk
ml
mn
mr
ms
mt
my
na
nb
nb
nd
nd
ne
ng
nl
nl
nn
nn
no
nr
nr
nv
nv
ny
ny
ny
oc
oj
om
or
os
os
pa
</