Put under version control

commit a6af5b12d2 (parent d26d23348b)
83 changed files with 20130 additions and 0 deletions
.gitignore (vendored, new file, 51 lines)
@@ -0,0 +1,51 @@
# Backup files
*.~

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
NOTES

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
htmlcov

# Translations
*.mo

# mypy cache
.mypy_cache

# Sphinx documentation
doc/build/
doc/source/reference/

# tmp dir
tmp/
.pre-commit-config.yaml (new file, 30 lines)
@@ -0,0 +1,30 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/psf/black
    rev: 21.11b1
    hooks:
      - id: black
  - repo: https://github.com/timothycrosley/isort
    rev: 5.10.1
    hooks:
      - id: isort
        args: ["--profile", "black", "--filter-files", "-l", "79"]
  - repo: https://github.com/myint/autoflake
    rev: v1.4
    hooks:
      - id: autoflake
        args:
          [
            "--in-place",
            "--remove-all-unused-imports",
            "--ignore-init-module-imports",
            "--remove-unused-variables",
          ]
Pipfile (new file, 46 lines)
@@ -0,0 +1,46 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
aiohttp = "*"
async-lru = "*"
asyncpg = "*"
beautifulsoup4 = "*"
elasticsearch = { version = ">=7.0.0,<8.0.0", extras = ['async'] }
elasticsearch-dsl = { version = ">=7.0.0,<8.0.0" }
feedparser = "*"
gcld3 = "*"
# TODO: recheck
pypandoc = "*"
pytidylib = "*"
pytz = "*"
pyyaml = "*"
tika = "*"
tldextract = "*"
voluptuous = "*"
simhash = "*"
async-dns = "*"
types-pyyaml = "*"
sphinx-rtd-theme = "*"

[dev-packages]
mypy = "*"
pre-commit = "*"
sphinx = "*"
myst-parser = "*"
isort = "*"
blacken-docs = "*"
pybetter = "*"
interrogate = "*"
autoflake = "*"
types-pyyaml = "*"
types-pytz = "*"
black = "*"

[requires]
python_version = "3.9"

[pipenv]
allow_prereleases = true
Pipfile.lock (generated, new file, 1561 lines): diff suppressed because it is too large.
README.md (new file, 13 lines)
@@ -0,0 +1,13 @@
atextcrawler is an asynchronous webcrawler indexing text for literal and semantic search.

Its client-side counterpart is [atextsearch](https://gitea.multiname.org/a-text/atextsearch)

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.
doc/Makefile (new file, 20 lines)
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
doc/source/conf.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys

proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')


# -- Project information -----------------------------------------------------

project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'

# The full version, including alpha/beta/rc tags
release = '0.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'myst_parser',
    'sphinx.ext.graphviz',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

autosummary_generate = True

source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}
doc/source/config_template/initial_data/seed_urls.list (new file, 23 lines)
@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.

# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/

# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/
doc/source/config_template/main.yaml (new file, 88 lines)
@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
#  - 'dev': development instance
#  - 'staging': staging instance
#  - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as a relative path, it will be relative to the
# directory of this file (main.yaml).
# Read the documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________

# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item;
  # also the delay in seconds after a worker has found
  # no site to process
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # Host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext

# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict
doc/source/config_template/plugins/__init__.py (new file, empty)
doc/source/config_template/plugins/filter_resource_path.py (new file, 22 lines)
@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.

This plugin implements :func:`rp_filter`.
"""

from typing import Optional


def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter found paths (may depend on site).

    To filter out a path (i.e., not add it to table `site_path`)
    return None.
    """
    path = durl.pwa()
    # skip fetching images (linked from a tags; img tags are skipped anyway)
    if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
        return None
    path = path.removesuffix('?amp=1')
    return path
doc/source/config_template/plugins/filter_site.py (new file, 47 lines)
@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.

This plugin implements :func:`site_filter`.
"""

import re

from atextcrawler.models import Site

MIN_RELEVANCE_SCORE = 5


async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    If the site shall be crawled, return True, else False.
    """
    # limit to sites in English or German language
    if not set(['de', 'en']) & set(site.langs):
        return False
    score = 0.0
    for crit_name, weight, langs, crit_re in re_criteria:
        if '*' in langs or set(langs) & set(site.langs):
            findings = crit_re.findall(site.startpage_text)
            if findings:
                score += weight * len(findings)
            if site.title and crit_re.search(site.title):
                score += 4 * weight
            if site.description and crit_re.search(site.description):
                score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE


re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    # note: the language entry must be a 1-tuple ('de',); a bare ('de') is just the string 'de'
    ('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
}
doc/source/config_template/plugins/filter_site_path.py (new file, 24 lines)
@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.

This plugin implements :func:`sp_filter`.
"""


def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    if 'amusewiki' in site.meta_info.get('generator', '').lower():
        if any(
            [
                path.endswith(end)
                for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
            ]
        ):
            return False
    if '/bbselect?' in path:
        return False
    return True
doc/source/devel/devel.md (new file, 63 lines)
@@ -0,0 +1,63 @@
## Setup dev environment
1. You need Python 3.9 or later.
1. Install pipenv, e.g.: install pip3 with `apt install python3-pip`, then run `pip3 install --user pipenv`.
1. Clone the repo and set up a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```

## Configure the instance
See [installation](installation.md).

## Run
```
python -m atextcrawler
```

## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```

## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```

## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```

## Release
There are no releases (currently).

## Useful commands

### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```

### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;

http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*

http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices

-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
```
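
### Notify a running instance
The application listens on a PostgreSQL channel named `'atextcrawler_' + instance_name`
(see `handle_notifications` in `application.py`); a message `site_update <site_id>`
makes a worker that is currently crawling that site cancel its crawl.
Below is a minimal sketch (not part of the repo) of sending such a notification with
asyncpg; the site id (123) and the credentials are placeholders for your dev instance.
```
import asyncio

import asyncpg


async def main():
    # connection parameters as configured for the dev instance (placeholders)
    conn = await asyncpg.connect(
        host='127.0.0.1',
        port=5432,
        database='atextcrawler-dev',
        user='atextcrawler-dev',
        password='*************',
    )
    # channel name for instance_name 'atextcrawler_dev'
    await conn.execute("NOTIFY atextcrawler_atextcrawler_dev, 'site_update 123'")
    await conn.close()


asyncio.run(main())
```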
doc/source/devel/related_work.md (new file, 64 lines)
@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)

### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
  * [repo](https://github.com/adbar/trafilatura)
  * [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)

#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)

### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)

### url handling
* [courlan](https://pypi.org/project/courlan/)

### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)

### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/)

### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously

### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
* https://support.shareaholic.com/hc/en-us/articles/115003085186

### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language

ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview
doc/source/devel/todo.md (new file, 77 lines)
@@ -0,0 +1,77 @@
## TODO

* parse html time tags

* site annotations:
  * categories
  * historical (no changes any more since n months)
  * news
  * local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip

* allow for tls in elasticsearch config

* replace dashes, dots and quotes (see the sketch after the mapping below): https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
'–': '--',
'—': '---',
'…': '...',
'“': '"',
'”': '"',
'„': '"',
'″': '"',
'‘': "'",
'’': "'",
'′': "'",
```
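A self-contained sketch of such a replacement (an illustration, not code from this repo), using `str.translate` with a mapping like the one above:
```
import sys

# map typographic dashes, ellipses and quotes to ASCII stand-ins
UNSMARTEN = str.maketrans({
    '\u2013': '--',   # en dash
    '\u2014': '---',  # em dash
    '\u2026': '...',  # ellipsis
    '\u201c': '"', '\u201d': '"', '\u201e': '"', '\u2033': '"',
    '\u2018': "'", '\u2019': "'", '\u2032': "'",
})


def unsmarten(text: str) -> str:
    return text.translate(UNSMARTEN)


if __name__ == '__main__':
    print(unsmarten(sys.stdin.read()), end='')
```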
* normalize quotation marks and punctuation in general
  * https://unicode-table.com/en/sets/quotation-marks/
  * https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
  * https://www.fileformat.info/info/unicode/category/Po/list.htm
  * https://www.gaijin.at/en/infos/unicode-character-table-punctuation
  * ⁝

* cancel crawls that take too long

* search for "TODO" in code

* feedparser has support for JSON feeds since commit
  a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
  (as of 2020-10-26 in the "develop" branch, not part of a release);
  the version names are 'json1' and 'json11'

* allow site URLs with path, e.g.
  https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/

* add more languages

## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives

* [spacy-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)

* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)
doc/source/development.rst (new file, 9 lines)
@@ -0,0 +1,9 @@
Development
-----------

.. toctree::
   :maxdepth: 2

   devel/devel
   devel/todo
   devel/related_work
doc/source/elasticsearch.md (new file, 119 lines)
@@ -0,0 +1,119 @@
# Howto elasticsearch

## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```

If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```

## Setup

### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).

We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.

```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```

First test:
```
http -j GET 127.0.0.1:9200/
```

### Storage

```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```

Edit /etc/elasticsearch/elasticsearch.yml
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```

```
systemctl restart elasticsearch
```

The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```

### Setup passwords
Setup passwords:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```

Copy the output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```

Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```

### Memory limitation
To limit memory usage, add a systemd override and restart the service:
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF

systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```

## Usage
Some useful requests:

### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```
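
From Python, the key can then be used with the async client pinned in the Pipfile
(elasticsearch[async] 7.x). This is a minimal sketch, not code from this repo;
replace the id and secret with the values returned by the request above:
```
import asyncio

from elasticsearch import AsyncElasticsearch


async def main():
    es = AsyncElasticsearch(
        hosts=['127.0.0.1'],
        api_key=('API_KEY_ID', 'API_KEY_SECRET'),  # placeholders
    )
    # list the indices this key may access
    print(await es.cat.indices(index='anarchism_*', format='json'))
    await es.close()


asyncio.run(main())
```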
doc/source/index.rst (new file, 37 lines)
@@ -0,0 +1,37 @@
atextcrawler
============

atextcrawler is an asynchronous webcrawler indexing text
for literal and semantic search.

Its client-side counterpart is atextsearch_.

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   introduction
   installation
   maintenance
   development
   reference/modules


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
doc/source/installation.md (new file, 122 lines)
@@ -0,0 +1,122 @@
# Installation
Installation was only tested on Debian bullseye (on amd64).
The instructions below are for this system.
(Please adapt to other environments.)

## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for python package gcld3 (see below).

## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```

## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).

Note: TLS is not yet supported, so install this service locally.

See [elasticsearch howto](elasticsearch.md).

## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.

Note: TLS is not yet supported, so install this service locally.

See [tensorflow howto](tensorflow_model_server.md).

## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages # for systemd
pre-commit install
```

Note: One of the dependencies, Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```

## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```

Edit `$HOME/.config/atextcrawler/main.yaml`.

If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/plugins
```

Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.

Check (and print) the instance configuration:
```
python -m atextcrawler.config
```

## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.

## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target

[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```
doc/source/introduction.md (new file, 66 lines)
@@ -0,0 +1,66 @@
# Introduction

## What atextcrawler does
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
  of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
  optimized for html5); discard non-text content, but handle feeds
  and sitemaps
* Extract internal and external links; external links contribute
  to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch,
  using tensorflow model server with a multilingual language model

## Architecture
Everything runs in a single Python process.
We use asyncio where possible (almost everywhere).

1. There is a queue of websites, see database table `site_queue`.
   The queue is fed a) on first startup with seeds, b) manually
   and c) from crawls which find external links.
   When the queue is handled, new sites are stored to table `site`.
   New sites are always updated; existing sites only if the last update was more than `crawl.site_revisit_delay` seconds in the past.
   After the queue has been handled there is a delay
   (`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
   criteria are applied to its content to determine whether
   the site is relevant. (It is assumed that (non-)relevance is
   obvious from the start page already.) If the site is relevant,
   more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
   `crawl.workers`) which concurrently crawl sites, one at a time
   per worker. (During the crawl the site is marked as locked using
   crawl_active=true.) They pick a relevant site which has not been crawled for a certain time ("checkout"), crawl it, and finally mark it as crawled (crawl_active=false, "checkin") and schedule the next crawl.
   Each crawl (with begin time, end time, number of found (new)
   resources) is stored in table `crawl`.
1. Crawls are either full crawls (all paths reachable
   through links from the start page are fetched) or feed crawls (only paths listed in a feed of the site are fetched). The respective (minimum) intervals in which these crawls happen are `full_crawl_interval` and `feed_crawl_interval`.
   Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
   sitemap) or a TextResource (redirects are followed and irrelevant content is ignored). A TextResource obtained from a path can be very similar to a resource obtained from another path; in this case no new resource is created, but both paths are linked to the same resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
   added to table `site_path`. If it is a feed, the feed is stored in table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.

## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are considered:
blacklisting or whitelisting has precedence over function `site_filter`
(in plugin `filter_site`).

Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).

Each annotation requires the base_url of the annotated site and,
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).

## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
  background task on a server with limited resources
  (or even an SBC, like a raspberry pi, with attached storage)
* atextcrawler only indexes text, no other resources like images
doc/source/maintenance.md (new file, 23 lines)
@@ -0,0 +1,23 @@
# Maintenance

## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```

## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```
doc/source/tensorflow_model_server.md (new file, 98 lines)
@@ -0,0 +1,98 @@
# Tensorflow model server

## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```

## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```

Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```

Config file `/srv/tensorflow/config`:
```
model_config_list: {
  config: {
    name: "sentences",
    base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
    model_platform: "tensorflow"
    model_version_policy: {latest{}},
  },
  config: {
    ... (next model)
  },
}
```

## Systemd integration
Edit /etc/systemd/system/tensorflow.service
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service

[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```

Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```

## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```

## Docs

* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server

Datasets:
* https://www.tensorflow.org/datasets/catalog/overview
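
## Query the prediction endpoint from Python
To complement the Usage section above: a minimal sketch (not code from this repo) of
getting sentence embeddings via the REST predict API, assuming the server configured
here is listening on port 9000 with model name `sentences`. The request body follows
the TensorFlow Serving REST API ("instances" in, "predictions" out).
```
import asyncio

import aiohttp

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'


async def embed(texts):
    # POST {"instances": [...]} and return the list of embedding vectors
    async with aiohttp.ClientSession() as session:
        async with session.post(ENDPOINT, json={'instances': texts}) as resp:
            data = await resp.json()
    return data['predictions']


embeddings = asyncio.run(embed(['Hello world', 'Hallo Welt']))
print(len(embeddings), len(embeddings[0]))
```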
license.txt (new file, 48 lines)
@@ -0,0 +1,48 @@
ANTI-AUTHORITARIAN LICENSE version 1.0
________________________________________________________________________________

Obviously, this license is relevant to all who are bound by law.

The licensee ("you") must not be a commercial, military, clerical or
governmental entity. For this license the term "software" means the program
code, documentation as well as other data (for instance, language files).

Subject to the respective terms and conditions described below the licensee
is granted the non-exclusive and non-transferable license to:
A. make copies of the software
B. create derivative works ("modifications")
C. install and run copies or modifications of the software on any number of
   servers, thereby making them usable for the licensee and possibly others
D. offer or give copies or modifications of the software, or parts of the
   unmodified or modified software to others

For these permissions the respective conditions stated below must be met:
* For permission A condition 1 must be met.
* For permission B all of the conditions 1, 3, 4 must be met.
* For permission C all of the conditions 2, 3 must be met.
* For permission D all of the conditions 1, 2, 3, 4, 5 must be met.

These are the conditions:
1. You include this copyright notice and license in any copy or modification.
   In files that contain a reference to it you preserve this reference.
2. You do not use this software or any modification of it for any commercial
   purpose or for monetary gain, and also not for any military, governmental
   or religious purpose; here with commercial purpose we mean activities which
   have among their goals to make profit, be it monetary profit or any other
   kind of profit that may entail or contribute to monetary profit.
3. Demos or screenshots of the modified or unmodified software must not be
   published in any medium which requires the viewers to pay money in order
   to see the contents; here money paid for mere internet connectivity (i.e.,
   independent of the content supplier) is to be disregarded.
4. You do not impose any further restrictions on this software or any
   derivative works beyond those restrictions herein.
5. The copy or modification must include source code, and must allow
   distribution in source code as well as compiled form. The source code
   must be the preferred form in which a programmer would modify the program.
   Deliberately obfuscated source code is not allowed. Intermediate forms
   such as the output of a preprocessor or translator are not allowed.

For this license itself, if re-used for other software, the following
copyright and license applies (copyheart license):

♡ Copying is an act of love. Please copy.
pyproject.toml (new file, 10 lines)
@@ -0,0 +1,10 @@
# TOML formatted file; see PEP 518

[tool.isort]
profile = "black"
#multi_line_output = 3

[tool.black]
line-length = 79
target_version = ['py39']
skip-string-normalization = true
src/atextcrawler/__init__.py (new file, empty)
src/atextcrawler/__main__.py (new file, 12 lines)
@@ -0,0 +1,12 @@
"""
atextcrawler application execution entry point.
"""

import asyncio

from .application import Application
from .config import Config

if __name__ == '__main__':
    config = Config().get()
    asyncio.run(Application(config).run())
204
src/atextcrawler/application.py
Normal file
204
src/atextcrawler/application.py
Normal file
|
@ -0,0 +1,204 @@
|
||||||
|
"""
|
||||||
|
atextcrawler application.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import importlib
|
||||||
|
import logging
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from systemd.journal import JournalHandler
|
||||||
|
|
||||||
|
from .config import Config
|
||||||
|
from .crawl import CrawlWorker
|
||||||
|
from .db import PGPool
|
||||||
|
from .search import shutdown_engine, startup_engine
|
||||||
|
from .site import load_seeds, process_site_queue
|
||||||
|
|
||||||
|
plugin_names = ['filter_site', 'filter_site_path', 'filter_resource_path']
|
||||||
|
|
||||||
|
|
||||||
|
class Application:
|
||||||
|
"""
|
||||||
|
atextcrawler application.
|
||||||
|
|
||||||
|
The basic structure of the application is this:
|
||||||
|
* one site crawler works just on the site_queue: fetching start pages
|
||||||
|
of sites and storing updated site information in table sites
|
||||||
|
* N other CrawlWorkers each do this in a loop:
|
||||||
|
checkout a site that is due for crawl and crawl its resources;
|
||||||
|
they fill the site_queue
|
||||||
|
"""
|
||||||
|
|
||||||
|
running = True
|
||||||
|
|
||||||
|
def __init__(self, config=None):
|
||||||
|
if config is None:
|
||||||
|
config = Config().get()
|
||||||
|
self.config = config
|
||||||
|
self.instance_name = config['instance_name']
|
||||||
|
self.instance_type = config['instance_type']
|
||||||
|
log_level = getattr(
|
||||||
|
logging, config['log_level'].upper(), logging.CRITICAL
|
||||||
|
)
|
||||||
|
self.logger = logging.getLogger('atextcrawler')
|
||||||
|
self.logger.setLevel(log_level)
|
||||||
|
if self.instance_type == 'dev':
|
||||||
|
self.logger.addHandler(logging.StreamHandler())
|
||||||
|
else:
|
||||||
|
self.logger.addHandler(
|
||||||
|
JournalHandler(SYSLOG_IDENTIFIER=self.instance_name)
|
||||||
|
)
|
||||||
|
self.logger.propagate = False
|
||||||
|
self.channel = 'atextcrawler_' + self.config['instance_name']
|
||||||
|
msg = f'Instance "{self}" initializing'
|
||||||
|
self.logger.info(msg)
|
||||||
|
self.plugins = self._load_plugins()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.instance_name
|
||||||
|
|
||||||
|
def _load_plugins(self):
|
||||||
|
"""
|
||||||
|
Return a dict mapping plugin names to modules.
|
||||||
|
"""
|
||||||
|
modules = {}
|
||||||
|
old_path = sys.path
|
||||||
|
        for name in plugin_names:
            try:
                plugins_dir = self.config['plugins_dir']
                sys.path.insert(0, plugins_dir)
                module = importlib.import_module(name)
                msg = f'Loading plugin "{name}" from {plugins_dir}'
            except:
                module = importlib.import_module(
                    'atextcrawler.plugin_defaults.' + name
                )
                msg = f'Loading plugin "{name}" from default location'
            self.logger.info(msg)
            modules[name] = module
        sys.path = old_path
        return modules

    async def run(self):
        """
        Application lifecycle.
        """
        await asyncio.gather(self.wait_for_shutdown(), self.startup())
        await self.shutdown()

    async def startup(self):
        """
        Asynchronous startup.
        """
        msg = f'Instance "{self}" starting components'
        self.logger.info(msg)
        self.search_engine = await startup_engine(self.config)
        self.pgpool = await PGPool(self.config['postgresql'])
        self.pool = self.pgpool.pool
        await load_seeds(self.config, self.pool)
        await reset_site_locks(self.pool)
        worker_count = self.config['crawl']['workers']
        self.workers = []
        for worker_number in range(worker_count):
            worker = await CrawlWorker(self, worker_number, self.pool)
            self.workers.append(worker)
        worker_coros = [worker.run() for worker in self.workers]
        await asyncio.gather(
            process_site_queue(self, self.pool),
            self.handle_notifications(),
            *worker_coros,
        )

    async def wait_for_shutdown(self):
        """
        Create a shutdown event (:class:`asyncio.Event`) and wait for it.

        The event will be set by a signal handler for SIGINT
        and SIGTERM signals (see :meth:`Application.handle_shutdown_signal`).
        """
        self.shutdown_event = asyncio.Event()
        for sig in (signal.SIGINT, signal.SIGTERM):
            asyncio.get_running_loop().add_signal_handler(
                sig, self.handle_shutdown_signal
            )
        self.logger.debug(f'{self} waiting for shutdown event')
        await self.shutdown_event.wait()
        self.logger.info(f'Instance "{self}" shutdown event')

    def handle_shutdown_signal(self):
        """
        Handle shutdown signal.
        """
        if self.shutdown_event.is_set():
            return
        self.shutdown_event.set()
        self.running = False

    async def shutdown(self):
        """
        Asynchronous shutdown.
        """
        self.logger.debug(f'Instance "{self}" shutting down')
        await self.notify_conn.remove_listener(
            self.channel, self.listen_callback
        )
        await self.pool.release(self.notify_conn)
        for worker in self.workers:
            await worker.shutdown()
        await shutdown_engine(self.search_engine)
        await self.pgpool.shutdown()
        self.logger.info(f'Instance "{self}" shutdown completed')

    async def handle_notifications(self):
        """
        Handle notifications using PostgreSQL's NOTIFY/LISTEN.
        """
        self.notify_conn = await self.pool.acquire()
        await self.notify_conn.add_listener(self.channel, self.listen_callback)

    def listen_callback(self, *args):
        """
        Handle notify event from PostgreSQL.
        """
        channel = args[2]
        if channel != self.channel:
            return
        message = args[3]
        if message.startswith('site_update '):
            try:
                site_id = int(message.removeprefix('site_update '))
                for worker in self.workers:
                    if worker.site and site_id == worker.site.id_:
                        msg = (
                            f'Cancelling worker {worker.worker_number}'
                            f' (site={site_id}) due to site_update'
                        )
                        self.logger.info(msg)
                        worker.running = False
            except:
                pass

    async def sleep(self, duration, t_slice=3):
        """
        Sleep for *duration* seconds while self.running.

        Check self.running every *t_slice* seconds.
        """
        remaining = duration
        while remaining > 0 and self.running:
            await asyncio.sleep(min(t_slice, remaining))
            remaining -= t_slice


async def reset_site_locks(pool):
    """
    Remove locks leftover from last run: Set crawl_active=false for all sites.

    This is relevant when the application was not shut down properly (e.g.
    when the process was killed).
    """
    async with pool.acquire() as conn:
        sql = "UPDATE site SET crawl_active = false WHERE crawl_active = true"
        await conn.execute(sql)
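
Note on the notification handling above: `listen_callback` receives the arguments that asyncpg passes to listeners (connection, pid, channel, payload), and a payload of the form `site_update <id>` cancels the worker currently crawling that site. The following is a minimal sketch (not part of the commit) of how such a notification could be emitted from any asyncpg connection; the function name and the `channel` argument are illustrative assumptions.

# Sketch: emit a site_update notification so that the worker crawling the
# given site cancels its current crawl. Assumes an open asyncpg connection
# and that `channel` equals the Application instance's notification channel.
import asyncpg


async def notify_site_update(
    conn: asyncpg.Connection, channel: str, site_id: int
) -> None:
    # NOTIFY payloads are plain strings; listen_callback parses the id back out.
    await conn.execute(
        "SELECT pg_notify($1, $2)", channel, f'site_update {site_id}'
    )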
7
src/atextcrawler/assets/iana_langs
Normal file
@@ -0,0 +1,7 @@
The recommended language tags to use in webpages are from
the IANA Language Subtag Registry (BCP47), see:
https://www.w3.org/International/questions/qa-html-language-declarations
https://r12a.github.io/app-subtags/


wget -O- https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | rg '^Subtag: |^Tag: ' atextcrawler/assets/iana_langs_ | sed -e 's/^Subtag: //' | sed -e 's/^Tag: //'
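
The asset file above is regenerated from the IANA language subtag registry with the shell pipeline shown in it. A rough Python equivalent (a sketch only, not code from the commit; the function name is an assumption) looks like this:

# Sketch: download the IANA registry and keep only the Subtag:/Tag: values,
# mirroring the wget | rg | sed pipeline documented in the asset file.
import urllib.request

REGISTRY_URL = (
    'https://www.iana.org/assignments/language-subtag-registry/'
    'language-subtag-registry'
)


def fetch_iana_tags() -> list[str]:
    with urllib.request.urlopen(REGISTRY_URL) as resp:
        text = resp.read().decode('utf-8')
    tags = []
    for line in text.splitlines():
        if line.startswith('Subtag: ') or line.startswith('Tag: '):
            tags.append(line.split(': ', 1)[1])
    return tags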
219
src/atextcrawler/assets/iso_639-1
Normal file
@@ -0,0 +1,219 @@
aa
ab
ae
af
ak
am
an
ar
as
av
ay
az
ba
be
bg
bh
bi
bm
bn
bo
br
bs
ca
ca
ce
ch
co
cr
cs
cu
cu
cu
cu
cu
cv
cy
da
de
dv
dv
dv
dz
ee
el
en
eo
es
es
et
eu
fa
ff
fi
fj
fo
fr
fy
ga
gd
gd
gl
gn
gu
gv
ha
he
hi
ho
hr
ht
ht
hu
hy
hz
ia
id
ie
ie
ig
ii
ii
ik
io
is
it
iu
ja
jv
ka
kg
ki
ki
kj
kj
kk
kl
kl
km
kn
ko
kr
ks
ku
kv
kw
ky
ky
la
lb
lb
lg
li
li
li
ln
lo
lt
lu
lv
mg
mh
mi
mk
ml
mn
mr
ms
mt
my
na
nb
nb
nd
nd
ne
ng
nl
nl
nn
nn
no
nr
nr
nv
nv
ny
ny
ny
oc
oj
om
or
os
os
pa
pa
pi
pl
ps
ps
pt
qu
rm
rn
ro
ro
ro
ru
rw
sa
sc
sd
se
sg
si
si
sk
sl
sm
sn
so
sq
sr
ss
st
su
sv
sw
ta
te
tg
th
ti
tk
tl
tn
to
tr
ts
tt
tw
ty
ug
ug
uk
ur
uz
ve
vi
vo
wa
wo
xh
yi
yo
za
za
zh
zu
10000
src/atextcrawler/assets/top_1e4
Normal file
File diff suppressed because it is too large
337
src/atextcrawler/config.py
Normal file
@@ -0,0 +1,337 @@
"""
|
||||||
|
Configuration loader and validator.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from io import TextIOBase
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
from voluptuous import All
|
||||||
|
from voluptuous import Any as VAny
|
||||||
|
from voluptuous import Invalid, Length, Range, Required, Schema, Url
|
||||||
|
from yaml import load
|
||||||
|
|
||||||
|
try:
|
||||||
|
from yaml import CLoader as Loader # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
from yaml import Loader # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigError(Exception):
|
||||||
|
"""
|
||||||
|
Application configuration error.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, err):
|
||||||
|
self.msg = str(err)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f'Application configuration error: {self.msg}'
|
||||||
|
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
"""
|
||||||
|
Application configuration.
|
||||||
|
|
||||||
|
Access the full application configuration using :meth:`get`.
|
||||||
|
|
||||||
|
It is a dictionary with these keys:
|
||||||
|
|
||||||
|
* 'directory': the configuration directory being used
|
||||||
|
* 'main': the main configuration from main.yaml, but
|
||||||
|
postgresql configuration may be overriden by environment
|
||||||
|
variable ATEXTCRAWLER_POSTGRESQL
|
||||||
|
"""
|
||||||
|
|
||||||
|
config = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get(
|
||||||
|
cls,
|
||||||
|
out: Optional[TextIOBase] = None,
|
||||||
|
) -> Optional[dict]:
|
||||||
|
"""
|
||||||
|
Load and validate app configuration if not already done; return it.
|
||||||
|
|
||||||
|
On errors print them to *out* and if out is sys.stdout, then
|
||||||
|
also exit with exit code 2. Otherwise just return None.
|
||||||
|
"""
|
||||||
|
if cls.config:
|
||||||
|
return cls.config
|
||||||
|
if out is None:
|
||||||
|
out = sys.stdout # type: ignore
|
||||||
|
_config = _load_config()
|
||||||
|
msg = None
|
||||||
|
if isinstance(_config, ConfigError):
|
||||||
|
msg = f'ERROR: configuration could not be loaded: {_config}'
|
||||||
|
else:
|
||||||
|
config = _validate_config(_config)
|
||||||
|
if isinstance(config, ConfigError):
|
||||||
|
config_dir = _config.get('config_dir')
|
||||||
|
msg = (
|
||||||
|
f'ERROR: invalid configuration in {config_dir}:'
|
||||||
|
f' {config}'
|
||||||
|
)
|
||||||
|
if isinstance(_config, ConfigError) or isinstance(config, ConfigError):
|
||||||
|
print(msg, file=out)
|
||||||
|
if out == sys.stdout:
|
||||||
|
sys.exit(2)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
config['postgresql']['min_size'] = config['crawl']['workers'] + 2
|
||||||
|
config['postgresql']['max_size'] = config['crawl']['workers'] + 2
|
||||||
|
cls.config = config
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config() -> Union[ConfigError, dict]:
|
||||||
|
"""
|
||||||
|
Load configuration; search in multiple directories.
|
||||||
|
|
||||||
|
We search these locations; the first location containing main.yaml
|
||||||
|
will be used::
|
||||||
|
|
||||||
|
* a directory defined in environment variable ATEXTCRAWLER_CONF
|
||||||
|
* subdir .config/atextcrawler in the user's home (`$HOME`)
|
||||||
|
* /etc/atextcrawler
|
||||||
|
|
||||||
|
In the same directory where this main.conf is located a subdirectory
|
||||||
|
'plugins' must exist and contain the configurations of plugins.
|
||||||
|
|
||||||
|
On failure return the first error and None.
|
||||||
|
Otherwise return None and a dict with these keys:
|
||||||
|
|
||||||
|
* `directory`: the used configuration directory
|
||||||
|
* `main`: the main application configuration
|
||||||
|
* `plugins`: a dict mapping plugins names to plugin configurations
|
||||||
|
"""
|
||||||
|
Path(__file__).parent.parent
|
||||||
|
config_dirs = []
|
||||||
|
if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
|
||||||
|
config_dirs.append(Path(env_conf))
|
||||||
|
if env_home := os.environ.get('HOME'):
|
||||||
|
config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
|
||||||
|
config_dirs.append(Path('/etc/atextcrawler'))
|
||||||
|
for config_dir in config_dirs:
|
||||||
|
main_yaml_path = config_dir / 'main.yaml'
|
||||||
|
if main_yaml_path.exists():
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
locs = ', '.join([str(loc) for loc in config_dirs if loc])
|
||||||
|
msg = (
|
||||||
|
f'Missing main.yaml in all config locations: {locs}\n'
|
||||||
|
f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
|
||||||
|
f' to define a custom config directory.'
|
||||||
|
)
|
||||||
|
return ConfigError(msg)
|
||||||
|
|
||||||
|
# load main.yaml
|
||||||
|
try:
|
||||||
|
with main_yaml_path.open() as main_yaml:
|
||||||
|
main_config = load(main_yaml.read(), Loader=Loader)
|
||||||
|
except Exception as err:
|
||||||
|
return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')
|
||||||
|
|
||||||
|
# main_config must be a dict
|
||||||
|
if not isinstance(main_config, dict):
|
||||||
|
return ConfigError(f'File {main_yaml_path} must contain a dictionary')
|
||||||
|
|
||||||
|
# postgresql config from environment has precedence
|
||||||
|
postgresql_config = _get_env_postgresql()
|
||||||
|
if isinstance(postgresql_config, ConfigError):
|
||||||
|
return postgresql_config
|
||||||
|
main_config['postgresql'] = postgresql_config or main_config['postgresql']
|
||||||
|
|
||||||
|
main_config['config_dir'] = str(config_dir)
|
||||||
|
return main_config
|
||||||
|
|
||||||
|
|
||||||
|
def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
|
||||||
|
"""
|
||||||
|
Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.
|
||||||
|
|
||||||
|
Return an error or the PostgreSQL config (which can be None if
|
||||||
|
the environment variable is not defined.
|
||||||
|
"""
|
||||||
|
env_var = 'ATEXTCRAWLER_POSTGRESQL'
|
||||||
|
value = os.environ.get(env_var, '').strip()
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
param_names = (
|
||||||
|
'host',
|
||||||
|
'port',
|
||||||
|
'database',
|
||||||
|
'user',
|
||||||
|
'password',
|
||||||
|
'schema_name',
|
||||||
|
)
|
||||||
|
re_dsn = re.compile(
|
||||||
|
'((' + '|'.join(param_names) + ')'
|
||||||
|
'=("(((?=[^"\\\\]).|\\\\.)*)"' # value in double quotes
|
||||||
|
'|\'(((?=[^\'\\\\]).|\\\\.)*)\'' # value in single quotes
|
||||||
|
'|([^"\' ]*)' # value unquoted
|
||||||
|
')( |$))+?'
|
||||||
|
)
|
||||||
|
params = {}
|
||||||
|
for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
|
||||||
|
params[varname] = (
|
||||||
|
v3
|
||||||
|
or (v1 or '').replace('\\"', '"')
|
||||||
|
or (v2 or '').replace("\\'", "'")
|
||||||
|
)
|
||||||
|
if 'host' not in params:
|
||||||
|
params['host'] = 'localhost'
|
||||||
|
if 'port' not in params:
|
||||||
|
params['port'] = '5432'
|
||||||
|
if 'schema_name' not in params:
|
||||||
|
params['schema_name'] = 'public'
|
||||||
|
for name in param_names:
|
||||||
|
if name not in params:
|
||||||
|
return ConfigError(
|
||||||
|
f'Missing {name} in environment variable {env_var}'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
params['port'] = int(params['port'])
|
||||||
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_config(config: Any) -> Union[ConfigError, dict]:
|
||||||
|
"""
|
||||||
|
Validate the given configuration and fill in default values.
|
||||||
|
|
||||||
|
If invalid, return only the first error.
|
||||||
|
Otherwise return the configuration with added default values.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return schema_main(config)
|
||||||
|
except Exception as err:
|
||||||
|
return ConfigError(err)
|
||||||
|
|
||||||
|
|
||||||
|
def plugins_dir(config):
|
||||||
|
"""
|
||||||
|
Validate plugins directory (absolute or relative path).
|
||||||
|
|
||||||
|
If it is a relative path, prepend the config_dir.
|
||||||
|
"""
|
||||||
|
config_dir = config['config_dir']
|
||||||
|
plugins_dir = config['plugins_dir']
|
||||||
|
if plugins_dir.startswith('/'):
|
||||||
|
try:
|
||||||
|
plugins_dir = Path(plugins_dir)
|
||||||
|
except:
|
||||||
|
raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
plugins_dir = str(Path(config_dir) / Path(plugins_dir))
|
||||||
|
config['plugins_dir'] = plugins_dir
|
||||||
|
except:
|
||||||
|
raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
|
||||||
|
if not (Path(plugins_dir) / '__init__.py').exists():
|
||||||
|
raise Invalid(f'plugins_dir "{plugins_dir}" has no "__init__.py"')
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def postgresql_identifier(value):
|
||||||
|
"""
|
||||||
|
Validate a PostgreSQL identifier.
|
||||||
|
"""
|
||||||
|
if not isinstance(value, str) or not re.match(
|
||||||
|
'^[a-z][a-z0-9_]{0,30}$', value
|
||||||
|
):
|
||||||
|
raise Invalid(
|
||||||
|
f'Invalid PostgreSQL identifier "{value}", '
|
||||||
|
f'pattern must be: [a-z][a-z0-9_]{0,30}'
|
||||||
|
)
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def positive_number(value):
|
||||||
|
"""
|
||||||
|
Validate a positive number (int or float).
|
||||||
|
"""
|
||||||
|
if (isinstance(value, int) or isinstance(value, float)) and value > 0:
|
||||||
|
return value
|
||||||
|
raise Invalid('Not a positive number')
|
||||||
|
|
||||||
|
|
||||||
|
schema_postgresql = Schema(
|
||||||
|
{
|
||||||
|
Required('host'): All(str, Length(min=1)),
|
||||||
|
Required('port', default=5432): All(int, Range(min=0, max=65535)),
|
||||||
|
Required('database'): All(str, Length(min=1)),
|
||||||
|
Required('user'): All(str, Length(min=1)),
|
||||||
|
Required('password'): str,
|
||||||
|
Required('schema_name', default='public'): postgresql_identifier,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_crawl = Schema(
|
||||||
|
{
|
||||||
|
Required('workers', default=10): All(int, Range(min=0, max=1000)),
|
||||||
|
Required('site_delay', default=600): positive_number,
|
||||||
|
Required('site_revisit_interval', default=3600): positive_number,
|
||||||
|
Required('resource_delay', default=5): positive_number,
|
||||||
|
Required('full_crawl_interval', default=864000): positive_number,
|
||||||
|
Required('feed_crawl_interval', default=86400): positive_number,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_elasticsearch = Schema(
|
||||||
|
{
|
||||||
|
Required('host'): All(str, Length(min=1)),
|
||||||
|
Required('api_key'): All(str, Length(min=1)),
|
||||||
|
Required('id'): All(str, Length(min=1)),
|
||||||
|
Required('index_base_name'): All(str, Length(min=1)),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_tensorflow = Schema(
|
||||||
|
{
|
||||||
|
Required('model_server_endpoint'): Url(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_main = Schema(
|
||||||
|
All(
|
||||||
|
{
|
||||||
|
Required('config_dir'): All(str, Length(min=1)),
|
||||||
|
Required(
|
||||||
|
'instance_name', default='atextcrawler'
|
||||||
|
): postgresql_identifier,
|
||||||
|
Required('instance_type', default='prod'): VAny(
|
||||||
|
'dev',
|
||||||
|
'staging',
|
||||||
|
'prod',
|
||||||
|
),
|
||||||
|
Required('log_level', default='info'): VAny(
|
||||||
|
'critical',
|
||||||
|
'error',
|
||||||
|
'warning',
|
||||||
|
'info',
|
||||||
|
'debug',
|
||||||
|
),
|
||||||
|
Required('plugins_dir', default='plugins'): All(
|
||||||
|
str, Length(min=1)
|
||||||
|
),
|
||||||
|
Required('postgresql'): schema_postgresql,
|
||||||
|
Required('crawl'): schema_crawl,
|
||||||
|
Required('elasticsearch'): schema_elasticsearch,
|
||||||
|
Required('tensorflow'): schema_tensorflow,
|
||||||
|
},
|
||||||
|
plugins_dir,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
pprint(Config().get())
|
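
As a usage sketch for the module above (assumptions: a valid main.yaml exists in one of the searched config directories; the DSN value and credentials shown are only illustrations), the PostgreSQL settings can be overridden via ATEXTCRAWLER_POSTGRESQL and the validated configuration loaded with Config:

# Usage sketch, not part of the commit.
import os

from atextcrawler.config import Config

os.environ['ATEXTCRAWLER_POSTGRESQL'] = (
    "host=localhost port=5432 database=atextcrawler"
    " user=atextcrawler password='secret' schema_name=public"
)

config = Config().get()  # loads, validates and caches the configuration
if config:
    print(config['config_dir'])          # directory containing main.yaml
    print(config['postgresql']['host'])  # taken from the environment variable
    print(config['crawl']['workers'])    # defaults to 10 if absent in main.yaml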
215
src/atextcrawler/crawl.py
Normal file
@@ -0,0 +1,215 @@
"""
|
||||||
|
Crawl a site.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
from .models import Crawl
|
||||||
|
from .resource import ResourceFetcher, get_site_path, process_site_path
|
||||||
|
from .site import (
|
||||||
|
RobotsInfo,
|
||||||
|
checkin_site,
|
||||||
|
checkout_site,
|
||||||
|
fetch_feeds,
|
||||||
|
process_site,
|
||||||
|
update_site,
|
||||||
|
)
|
||||||
|
from .tensorflow import TensorFlow
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlWorker:
|
||||||
|
"""
|
||||||
|
Worker fetching sites, crawling their resources and storing statistics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, app, worker_number, pool):
|
||||||
|
self.app = app
|
||||||
|
self.worker_number = worker_number
|
||||||
|
self.pool = pool
|
||||||
|
self.site_delay = self.app.config['crawl']['site_delay']
|
||||||
|
self.resource_delay = self.app.config['crawl']['resource_delay']
|
||||||
|
self.site = None
|
||||||
|
self.crawl = None
|
||||||
|
self.running = True # do crawl
|
||||||
|
|
||||||
|
def __await__(self):
|
||||||
|
return self.__ainit__().__await__()
|
||||||
|
|
||||||
|
async def __ainit__(self):
|
||||||
|
await self.startup()
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def startup(self):
|
||||||
|
"""
|
||||||
|
Asynchronous startup.
|
||||||
|
"""
|
||||||
|
logger.info(f'Starting worker {self.worker_number}')
|
||||||
|
self.conn = await self.pool.acquire()
|
||||||
|
self.session = aiohttp.ClientSession()
|
||||||
|
self.fetcher = ResourceFetcher(self.session)
|
||||||
|
self.tf = TensorFlow(self.app, self.session)
|
||||||
|
|
||||||
|
async def shutdown(self):
|
||||||
|
"""
|
||||||
|
Asynchronous shutdown.
|
||||||
|
"""
|
||||||
|
logger.info(f'Shutting down worker {self.worker_number}')
|
||||||
|
await self.session.close()
|
||||||
|
await self.pool.release(self.conn)
|
||||||
|
|
||||||
|
async def run(self):
|
||||||
|
"""
|
||||||
|
Worker loop: fetch a site, crawl its resources and store statistics.
|
||||||
|
|
||||||
|
If no site needs to be crawled, sleep for self.site_delay seconds
|
||||||
|
(configured in crawl.site_delay).
|
||||||
|
"""
|
||||||
|
await self.app.sleep(2)
|
||||||
|
while self.app.running and self.running:
|
||||||
|
self.site, is_full, more = await checkout_site(self.app, self.conn)
|
||||||
|
if not self.site:
|
||||||
|
msg = f'Worker {self.worker_number}: sites exhausted'
|
||||||
|
logger.debug(msg)
|
||||||
|
if not more:
|
||||||
|
await self.app.sleep(self.site_delay)
|
||||||
|
continue
|
||||||
|
self.crawl = await get_or_create_crawl(
|
||||||
|
self.conn, self.site.id_, is_full
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
if is_full:
|
||||||
|
site_upd, _ = await update_site(
|
||||||
|
self.app,
|
||||||
|
self.fetcher,
|
||||||
|
self.conn,
|
||||||
|
self.site.base_url,
|
||||||
|
site=self.site,
|
||||||
|
)
|
||||||
|
if site_upd and site_upd.crawl_enabled:
|
||||||
|
self.site = site_upd
|
||||||
|
await process_site(
|
||||||
|
self.fetcher,
|
||||||
|
self.conn,
|
||||||
|
self.site,
|
||||||
|
)
|
||||||
|
elif self.site.crawl_enabled:
|
||||||
|
await fetch_feeds(self.fetcher, self.conn, self.site)
|
||||||
|
if self.site.crawl_enabled:
|
||||||
|
await self.crawl_resources()
|
||||||
|
except:
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} failed crawl'
|
||||||
|
f' {self.crawl.id_} of site {self.site.id_}'
|
||||||
|
f' ({self.site.base_url})'
|
||||||
|
)
|
||||||
|
logger.exception(msg)
|
||||||
|
await self.crawl.finish(
|
||||||
|
self.conn, self.app.running and self.running
|
||||||
|
)
|
||||||
|
await checkin_site(self.app, self.conn, self.site, self.crawl)
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} finished crawl'
|
||||||
|
f' {self.crawl.id_}'
|
||||||
|
)
|
||||||
|
logger.debug(msg)
|
||||||
|
self.site = None
|
||||||
|
# if we were cancelled, but the app is still running, run again
|
||||||
|
if self.app.running:
|
||||||
|
self.running = True
|
||||||
|
msg = f'Closing crawler {self.worker_number}'
|
||||||
|
logger.debug(msg)
|
||||||
|
|
||||||
|
async def crawl_resources(self):
|
||||||
|
"""
|
||||||
|
Loop over resources of the site and process them. Collect statistics.
|
||||||
|
|
||||||
|
All workers operate on distinct sites, so no need for locking here.
|
||||||
|
"""
|
||||||
|
crawl_type = 'full' if self.crawl.is_full else 'feed'
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} beginning'
|
||||||
|
f' {crawl_type} crawl {self.crawl.id_}'
|
||||||
|
f' of site {self.site.id_} ({self.site.base_url})'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
resource_delay = self.resource_delay
|
||||||
|
robots = await RobotsInfo(self.site.base_url)
|
||||||
|
if robots.delay:
|
||||||
|
resource_delay = robots.delay
|
||||||
|
while self.app.running and self.running:
|
||||||
|
site_path = await get_site_path(
|
||||||
|
self.conn,
|
||||||
|
self.site,
|
||||||
|
self.crawl.t_begin,
|
||||||
|
only_new=not self.crawl.is_full,
|
||||||
|
)
|
||||||
|
if not site_path:
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} ending crawl'
|
||||||
|
f' {self.crawl.id_}: paths exhausted'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
sp_filter = self.app.plugins['filter_site_path'].sp_filter
|
||||||
|
if sp_filter(self.site, site_path.path, robots):
|
||||||
|
is_new_resource = await process_site_path(
|
||||||
|
self.app,
|
||||||
|
self.worker_number,
|
||||||
|
self.conn,
|
||||||
|
self.fetcher,
|
||||||
|
self.tf,
|
||||||
|
self.site,
|
||||||
|
site_path,
|
||||||
|
)
|
||||||
|
if is_new_resource:
|
||||||
|
self.crawl.n_resources_new += 1
|
||||||
|
if is_new_resource is not None:
|
||||||
|
self.crawl.n_resources += 1
|
||||||
|
await self.app.sleep(resource_delay)
|
||||||
|
else:
|
||||||
|
sql = (
|
||||||
|
"UPDATE site_path SET"
|
||||||
|
" last_visit=now() at time zone 'UTC',"
|
||||||
|
" filtered=true"
|
||||||
|
" WHERE id=$1"
|
||||||
|
)
|
||||||
|
await self.conn.execute(sql, site_path.id_)
|
||||||
|
except:
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} processing path failed'
|
||||||
|
f' in crawl {self.crawl.id_}: {site_path}'
|
||||||
|
)
|
||||||
|
logger.exception(msg)
|
||||||
|
site_path.ok_count -= 1
|
||||||
|
await site_path.save(self.conn)
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number}: stopped crawl' f' {self.crawl.id_}'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_or_create_crawl(conn, site_id, is_full=True) -> Crawl:
|
||||||
|
"""
|
||||||
|
Return a new or existing+unfinished crawl.
|
||||||
|
|
||||||
|
If an existing crawl is found, return it, disregarding whether
|
||||||
|
it is a full crawl or not.
|
||||||
|
"""
|
||||||
|
sql = "SELECT * FROM crawl WHERE site_id=$1 AND t_end is null LIMIT 1"
|
||||||
|
if row := await conn.fetchrow(sql, site_id):
|
||||||
|
return await Crawl().load_from_row(row)
|
||||||
|
else:
|
||||||
|
# create a new crawl
|
||||||
|
crawl = Crawl(
|
||||||
|
site_id=site_id,
|
||||||
|
is_full=is_full,
|
||||||
|
t_begin=datetime.utcnow(),
|
||||||
|
)
|
||||||
|
await crawl.save(conn)
|
||||||
|
return crawl
|
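
CrawlWorker above (like PGPool in db.py below) is created with `await CrawlWorker(...)`: `__await__` delegates to `__ainit__`, which runs the asynchronous part of initialization and returns the instance. A generic minimal sketch of this awaitable-initialization pattern (an illustration, not code from the commit):

# Sketch of the awaitable-initialization pattern: `await Thing(...)` runs the
# async part of construction before the instance is handed back.
import asyncio


class Thing:
    def __init__(self, name):
        self.name = name
        self.ready = False

    def __await__(self):
        return self.__ainit__().__await__()

    async def __ainit__(self):
        await asyncio.sleep(0)  # stands in for async setup (DB, HTTP, ...)
        self.ready = True
        return self


async def main():
    thing = await Thing('worker-0')  # same shape as: worker = await CrawlWorker(...)
    print(thing.name, thing.ready)


asyncio.run(main())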
162
src/atextcrawler/db.py
Normal file
@@ -0,0 +1,162 @@
"""
|
||||||
|
PostgreSQL connectivity.
|
||||||
|
|
||||||
|
PGPool can be used as context manager. It takes postgresql configuration
|
||||||
|
parameters and gives a connection pool.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import TextIOBase
|
||||||
|
from pathlib import Path
|
||||||
|
from traceback import format_exc
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from .utils.json import json_dumps, json_loads
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PGPool:
|
||||||
|
"""
|
||||||
|
Database connectivity: Provide a connection pool.
|
||||||
|
|
||||||
|
Can be used either as async context manager (giving a pool),
|
||||||
|
or as a class using async init and the shutdown method and
|
||||||
|
having the pool attribute.
|
||||||
|
|
||||||
|
After startup self.pool contains a PostgreSQL connection pool
|
||||||
|
(instance of :class:`asyncpg.pool.Pool`).
|
||||||
|
|
||||||
|
Startup also runs schema migrations (cf. directory `migrations`).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
postgresql_config: dict,
|
||||||
|
out: TextIOBase = None,
|
||||||
|
check: bool = True,
|
||||||
|
) -> None:
|
||||||
|
self.conf = postgresql_config
|
||||||
|
self.out = out or sys.stdout
|
||||||
|
self.check = check
|
||||||
|
self.pool = None
|
||||||
|
|
||||||
|
def __await__(self):
|
||||||
|
return self.__ainit__().__await__()
|
||||||
|
|
||||||
|
async def __ainit__(self):
|
||||||
|
await self.__aenter__()
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
"""
|
||||||
|
Return the connection pool after an optional check.
|
||||||
|
|
||||||
|
The check tests basic database access and runs missing migrations.
|
||||||
|
If the check fails, return None.
|
||||||
|
"""
|
||||||
|
pool_params = {
|
||||||
|
key: val
|
||||||
|
for key, val in self.conf.items()
|
||||||
|
if key
|
||||||
|
in (
|
||||||
|
'host',
|
||||||
|
'port',
|
||||||
|
'database',
|
||||||
|
'user',
|
||||||
|
'password',
|
||||||
|
'max_size',
|
||||||
|
'min_size',
|
||||||
|
)
|
||||||
|
}
|
||||||
|
pool_params['command_timeout'] = 30
|
||||||
|
self.pool = await asyncpg.create_pool(**pool_params, init=self._init)
|
||||||
|
if self.check:
|
||||||
|
async with self.pool.acquire() as conn:
|
||||||
|
if await self.check_or_migrate(conn):
|
||||||
|
return self.pool
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def _init(conn) -> None:
|
||||||
|
"""
|
||||||
|
Add JSON encoding and decoding to the given connection.
|
||||||
|
"""
|
||||||
|
await conn.set_type_codec(
|
||||||
|
'jsonb',
|
||||||
|
encoder=json_dumps,
|
||||||
|
decoder=json_loads,
|
||||||
|
schema='pg_catalog',
|
||||||
|
)
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc, tb) -> None:
|
||||||
|
"""
|
||||||
|
Close the connection pool.
|
||||||
|
"""
|
||||||
|
await self.shutdown()
|
||||||
|
|
||||||
|
async def shutdown(self):
|
||||||
|
"""
|
||||||
|
Close the pool.
|
||||||
|
"""
|
||||||
|
await self.pool.close()
|
||||||
|
|
||||||
|
async def check_or_migrate(self, conn: asyncpg.Connection) -> bool:
|
||||||
|
"""
|
||||||
|
Check database connectivity.
|
||||||
|
|
||||||
|
Return whether database connectivity is working.
|
||||||
|
"""
|
||||||
|
row = await conn.fetchrow('SELECT 1+1 AS result')
|
||||||
|
if not row or row.get('result') != 2:
|
||||||
|
msg = 'Database SELECT 1+1 not working; missing privileges?'
|
||||||
|
print(msg, file=self.out)
|
||||||
|
logger.critical(msg)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# determine current schema_version
|
||||||
|
try:
|
||||||
|
sql = "SELECT value::int FROM kvs WHERE key='schema_version'"
|
||||||
|
schema_version = await conn.fetchval(sql)
|
||||||
|
except:
|
||||||
|
schema_version = 0
|
||||||
|
|
||||||
|
# run missing migrations
|
||||||
|
migrations = get_migrations()
|
||||||
|
for number, text in sorted(migrations.items()):
|
||||||
|
if number > schema_version:
|
||||||
|
cmds = text.split('\n----\n')
|
||||||
|
for cmd in cmds:
|
||||||
|
if not cmd.strip():
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
await conn.execute(cmd)
|
||||||
|
except:
|
||||||
|
msg = (
|
||||||
|
f'Exception during migration {number} in '
|
||||||
|
f'statement\n{cmd}'
|
||||||
|
)
|
||||||
|
print(msg, file=self.out)
|
||||||
|
logger.critical(msg)
|
||||||
|
print(format_exc(), file=self.out)
|
||||||
|
logger.critical(format_exc())
|
||||||
|
return False
|
||||||
|
|
||||||
|
# return success
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def get_migrations() -> Dict[int, str]:
|
||||||
|
"""
|
||||||
|
Return migrations (number and text content of migration file).
|
||||||
|
"""
|
||||||
|
migrations_dir = Path(__file__).parent / 'migrations'
|
||||||
|
migrations = {}
|
||||||
|
for migration_file in migrations_dir.glob('*.sql'):
|
||||||
|
migration_number = int(migration_file.name[:-4])
|
||||||
|
with migration_file.open() as mig_file:
|
||||||
|
content = mig_file.read()
|
||||||
|
migrations[migration_number] = content
|
||||||
|
return migrations
|
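
A usage sketch for PGPool, illustrating both styles described in its docstring (the connection parameters below are placeholders; the check run on startup also applies any missing migrations):

# Usage sketch, not part of the commit.
import asyncio

from atextcrawler.db import PGPool

pg_conf = {
    'host': 'localhost',
    'port': 5432,
    'database': 'atextcrawler',
    'user': 'atextcrawler',
    'password': 'secret',
    'schema_name': 'public',
    'min_size': 2,
    'max_size': 5,
}


async def main():
    # style 1: async context manager giving the pool
    async with PGPool(pg_conf) as pool:
        print(await pool.fetchval('SELECT 1'))

    # style 2: async init plus explicit shutdown, pool on the instance
    pgpool = await PGPool(pg_conf)
    print(await pgpool.pool.fetchval('SELECT 1'))
    await pgpool.shutdown()


asyncio.run(main())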
297
src/atextcrawler/migrations/1.sql
Normal file
@@ -0,0 +1,297 @@
CREATE TABLE kvs (
    id bigserial PRIMARY KEY,
    t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
    key varchar(200) NOT NULL UNIQUE,
    value jsonb
)
----
COMMENT ON COLUMN kvs.t_update IS 'Time of last update or insert of the entry';
----
COMMENT ON COLUMN kvs.key IS 'Key';
----
COMMENT ON COLUMN kvs.value IS 'Value';
----
COMMENT ON TABLE kvs IS 'Simple key-value store';
----
INSERT INTO kvs (key, value) VALUES ('schema_version', '1');
----
CREATE TABLE site (
    id bigserial PRIMARY KEY,
    canonical_url varchar(200),
    base_url varchar(200) NOT NULL,
    base_urls varchar(200)[] NOT NULL,
    domains varchar(100)[],
    ips inet[] NULL,
    crawl_enabled bool NOT NULL DEFAULT false,
    crawl_active bool NOT NULL DEFAULT false,
    next_full_crawl timestamp,
    next_feed_crawl timestamp,
    last_update timestamp,
    last_pub timestamp,
    pub_dates jsonb NOT NULL DEFAULT '{}'::jsonb,
    langs char(2)[] NOT NULL DEFAULT ARRAY[]::varchar(2)[],
    alt_langs jsonb NOT NULL DEFAULT '{}'::jsonb,
    title varchar(200),
    description varchar(2000),
    keywords varchar(50)[] NOT NULL DEFAULT ARRAY[]::varchar(50)[],
    linkbacks jsonb NOT NULL DEFAULT '{}'::jsonb,
    meta_info jsonb NOT NULL DEFAULT '{}'::jsonb,
    boilerplate_texts jsonb NOT NULL DEFAULT '[]'::jsonb
)
----
CREATE INDEX site__base_url ON site (base_url)
----
CREATE INDEX site__base_urls ON site (base_urls)
----
CREATE INDEX site__domains ON site (domains)
----
CREATE INDEX site__ips ON site (ips)
----
CREATE INDEX site__next_full_crawl ON site (next_full_crawl)
----
CREATE INDEX site__next_feed_crawl ON site (next_feed_crawl)
----
CREATE INDEX site__langs ON site (langs)
----
CREATE INDEX site__title ON site (title)
----
CREATE INDEX site__description ON site (description)
----
CREATE INDEX site__keywords ON site (keywords)
----
COMMENT ON COLUMN site.base_url IS 'Preferred base URL (from column base_urls)'
----
COMMENT ON COLUMN site.base_urls IS 'Base URLs that have been found to return the same content'
----
COMMENT ON COLUMN site.domains IS 'Domains that have been found to return the same content'
----
COMMENT ON COLUMN site.ips IS 'IPv4 or IPv6 addresses of the hostnames in base_urls'
----
COMMENT ON COLUMN site.crawl_enabled IS 'Whether the site should be indexed'
----
COMMENT ON COLUMN site.crawl_active IS 'Whether a crawl is in progress'
----
COMMENT ON COLUMN site.next_full_crawl IS 'Crawl all resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.next_feed_crawl IS 'Crawl the feed resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.last_update IS 'Time of last update of this site (in this database)'
----
COMMENT ON COLUMN site.last_pub IS 'Estimated time of last content publication on the site'
----
COMMENT ON COLUMN site.pub_dates IS 'Change history: map visit date to estimated publication date'
----
COMMENT ON COLUMN site.langs IS 'Languages of the site (ISO 639-1 codes)'
----
COMMENT ON COLUMN site.alt_langs IS 'Map links to alternative language versions of the site to ISO 639-1 language codes'
----
COMMENT ON COLUMN site.title IS 'Title as obtained from title tag or meta tags'
----
COMMENT ON COLUMN site.description IS 'Description as obtained from meta tags'
----
COMMENT ON COLUMN site.keywords IS 'Keywords as obtained from meta tags'
----
COMMENT ON COLUMN site.linkbacks IS 'Map URL to type of linkback (cf. https://en.wikipedia.org/wiki/Linkback)'
----
COMMENT ON COLUMN site.meta_info IS 'Values from meta tags and other meta information'
----
COMMENT ON COLUMN site.boilerplate_texts IS 'Boilerplate texts on the startpage and other sample pages'
----
COMMENT ON TABLE site IS 'Website'
----
CREATE TABLE site_queue (
    id bigserial PRIMARY KEY,
    src bigint NULL REFERENCES site(id) ON DELETE CASCADE,
    url varchar(200) NOT NULL,
    link_text varchar(100),
    t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_queue__url ON site_queue (url)
----
COMMENT ON COLUMN site_queue.src IS 'The id of the linking site; null in case of seeds or manual additions'
----
COMMENT ON COLUMN site_queue.url IS 'Base URL of site to be assessed, ending with a slash or a mandatory base path'
----
COMMENT ON COLUMN site_queue.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON COLUMN site_queue.t_create IS 'Creation time of this entry'
----
COMMENT ON TABLE site_queue IS 'Queued site URLs'
----
CREATE TABLE site_feed (
    id bigserial PRIMARY KEY,
    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    url varchar(200) NOT NULL,
    etag text,
    modified varchar(50),
    t_visit timestamp,
    t_content timestamp,
    version varchar(10),
    title varchar(200),
    description text,
    fail_count smallint NOT NULL DEFAULT 0
)
----
CREATE INDEX site_feed__site ON site_feed (site_id)
----
CREATE INDEX site_feed__t_content ON site_feed (t_content)
----
COMMENT ON COLUMN site_feed.site_id IS 'Id of the site on which this feed was found'
----
COMMENT ON COLUMN site_feed.url IS 'URL of the feed'
----
COMMENT ON COLUMN site_feed.etag IS 'Etag obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.modified IS 'Last-Modified HTTP header value obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.t_visit IS 'Time of last retrieval of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.t_content IS 'Time of last content update; null before first retrieval'
----
COMMENT ON COLUMN site_feed.version IS 'Version of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.title IS 'Title of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.description IS 'Description of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.fail_count IS 'Number of failed retrievals after last successful retrieval; zero before first retrieval'
----
CREATE TABLE site_link (
    id bigserial PRIMARY KEY,
    src bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    dst bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
    link_text varchar(100)
)
----
ALTER TABLE site_link ADD CONSTRAINT site_link_edge UNIQUE (src, dst)
----
CREATE INDEX site_link__src ON site_link (src)
----
CREATE INDEX site_link__dst ON site_link (dst)
----
COMMENT ON COLUMN site_link.src IS 'Source site'
----
COMMENT ON COLUMN site_link.dst IS 'Destination site'
----
COMMENT ON COLUMN site_link.t_create IS 'Time of creation of this entry'
----
COMMENT ON COLUMN site_link.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON TABLE site_link IS 'Cross-site link'
----
CREATE TABLE resource (
    id bigserial PRIMARY KEY,
    simhash bigint,
    content_type varchar(50),
    last_change timestamp,
    text_len int,
    lang char(2),
    title varchar(200),
    summary varchar(2000)
)
----
COMMENT ON COLUMN resource.simhash IS 'Simhash of the text content of the resource'
----
COMMENT ON COLUMN resource.content_type IS 'Content type extracted from Content-Type HTTP header'
----
COMMENT ON COLUMN resource.last_change IS 'Estimated time of the last update of this resource'
----
COMMENT ON COLUMN resource.text_len IS 'Length of the extracted text in characters'
----
COMMENT ON COLUMN resource.lang IS 'Language ISO 639-1 code'
----
COMMENT ON COLUMN resource.title IS 'Title of the resource (used for feed resources)'
----
COMMENT ON COLUMN resource.summary IS 'Content summary of the resource (used for feed resources)'
----
COMMENT ON TABLE resource IS 'Text resource (may be reachable by more than one path of a site)'
----
CREATE TABLE site_path (
    id bigserial PRIMARY KEY,
    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    path varchar(400) NOT NULL,
    last_visit timestamp,
    filtered bool NOT NULL DEFAULT false,
    ok_count smallint NOT NULL DEFAULT 0,
    canonical bool,
    resource_id bigint REFERENCES resource(id) ON DELETE CASCADE
)
----
ALTER TABLE site_path ADD CONSTRAINT site_path__unique UNIQUE (site_id, path)
----
CREATE INDEX site_path__site_path ON site_path (site_id, path)
----
CREATE INDEX site_path__resource ON site_path (resource_id)
----
COMMENT ON COLUMN site_path.site_id IS 'Site id'
----
COMMENT ON COLUMN site_path.path IS 'Path'
----
COMMENT ON COLUMN site_path.last_visit IS 'Time of last retrieval of the resource; null before first retrieval'
----
COMMENT ON COLUMN site_path.ok_count IS 'Increased by 1 for every successful retrieval of the resource and decreased by 1 for every failed one'
----
COMMENT ON COLUMN site_path.canonical IS 'Whether the path is the canonical one for the resource; null before first retrieval'
----
COMMENT ON COLUMN site_path.resource_id IS 'Resource id; null before first retrieval'
----
COMMENT ON TABLE site_path IS 'Paths of a site pointing to text resources'
----
CREATE TABLE crawl (
    id bigserial PRIMARY KEY,
    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    is_full bool NOT NULL DEFAULT false,
    t_begin timestamp,
    t_end timestamp,
    n_resources int NOT NULL DEFAULT 0,
    n_resources_new int NOT NULL DEFAULT 0
)
----
CREATE INDEX crawl__site ON crawl (site_id)
----
CREATE INDEX crawl__t_begin ON crawl (t_begin)
----
COMMENT ON COLUMN crawl.site_id IS 'Site that is being crawled'
----
COMMENT ON COLUMN crawl.is_full IS 'Whether the crawl is a full crawl; if not it is a feed crawl'
----
COMMENT ON COLUMN crawl.t_begin IS 'Begin time of the crawl'
----
COMMENT ON COLUMN crawl.t_end IS 'End time of the crawl; if t_end is null resuming a crawl will fetch all resources with last_visit before t_begin'
----
COMMENT ON COLUMN crawl.n_resources IS 'Number of resources that were fetched during the crawl'
----
COMMENT ON COLUMN crawl.n_resources_new IS 'Number of new resources found during the crawl'
----
COMMENT ON TABLE crawl IS 'Crawl of resources on a site'
----
CREATE TYPE site_annotation_type AS ENUM ('whitelist', 'blacklist', 'suggestion', 'review', 'audience', 'location', 'themes', 'timescale')
----
COMMENT ON TYPE site_annotation_type IS 'Type of site annotation'
----
CREATE TABLE site_annotation (
    id bigserial PRIMARY KEY,
    site_id bigint REFERENCES site(id) ON DELETE SET NULL,
    base_url varchar(200) NOT NULL,
    ann_type site_annotation_type NOT NULL,
    ann_content JSONB,
    t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_annotation__site ON site_annotation (site_id)
----
CREATE INDEX site_annotation__base_url ON site_annotation (base_url)
----
COMMENT ON COLUMN site_annotation.site_id IS 'Site that is being annotated'
----
COMMENT ON COLUMN site_annotation.base_url IS 'Base URL of the site being annotated'
----
COMMENT ON COLUMN site_annotation.ann_type IS 'Annotation type'
----
COMMENT ON COLUMN site_annotation.ann_content IS 'Annotation content'
----
COMMENT ON COLUMN site_annotation.t_update IS 'Time of last update'
----
COMMENT ON TABLE site_annotation IS 'Manual annotations on a site'
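
An illustrative query against the schema above (a sketch, not part of the commit; the DSN and the selection criteria are assumptions chosen to match the column comments, e.g. fetching sites that are due for a full crawl and not currently locked):

# Sketch: read due sites directly with asyncpg.
import asyncio

import asyncpg


async def due_sites(dsn: str) -> list:
    conn = await asyncpg.connect(dsn)
    try:
        sql = (
            "SELECT id, base_url FROM site"
            " WHERE crawl_enabled AND NOT crawl_active"
            " AND next_full_crawl < now() at time zone 'utc'"
            " ORDER BY next_full_crawl"
            " LIMIT 10"
        )
        return await conn.fetch(sql)
    finally:
        await conn.close()


# rows = asyncio.run(due_sites('postgresql://user:password@localhost/atextcrawler'))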
610
src/atextcrawler/models.py
Normal file
@@ -0,0 +1,610 @@
"""
|
||||||
|
Data Models.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import InitVar, asdict, dataclass, field, fields
|
||||||
|
from datetime import date, datetime
|
||||||
|
from itertools import chain
|
||||||
|
from typing import Any, ClassVar, Optional
|
||||||
|
|
||||||
|
import tldextract
|
||||||
|
from asyncpg import Connection
|
||||||
|
|
||||||
|
from .search import delete_resource
|
||||||
|
from .utils.durl import Durl, get_url_variants
|
||||||
|
from .utils.link import extract_domain
|
||||||
|
from .utils.similarity import get_simhash, simhash_to_bigint
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ModelBase:
|
||||||
|
"""
|
||||||
|
Abstract base class for models.
|
||||||
|
|
||||||
|
Execute SQL to load, save, delete instances using asyncpg.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar
|
||||||
|
id_: Optional[int] = 0
|
||||||
|
|
||||||
|
async def load(self, conn: Connection, id_: int) -> Optional[Any]:
|
||||||
|
"""
|
||||||
|
If loading fails, return None.
|
||||||
|
"""
|
||||||
|
sql = f"SELECT * FROM {self.table} WHERE id=$1"
|
||||||
|
row = await conn.fetchrow(sql, id_)
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
return await self.load_from_row(row)
|
||||||
|
|
||||||
|
async def load_from_row(self, row):
|
||||||
|
"""
|
||||||
|
If row is None, return None.
|
||||||
|
"""
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
data = dict(row)
|
||||||
|
self.id_ = data.pop('id')
|
||||||
|
self.__init__(**data)
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def save(self, conn: Connection) -> None:
|
||||||
|
"""
|
||||||
|
Save the instance (update if self.id_ is set, else insert).
|
||||||
|
"""
|
||||||
|
data = asdict(self)
|
||||||
|
# logger.debug(f'Save {self}: id_={self.id_}')
|
||||||
|
if self.id_: # update
|
||||||
|
cols = ', '.join(data.keys())
|
||||||
|
upds = ', '.join(
|
||||||
|
[f'{col}=${i + 1}' for i, col in enumerate(data.keys())]
|
||||||
|
)
|
||||||
|
val_id = f'${len(data) + 1}'
|
||||||
|
sql = f"UPDATE {self.table} SET {upds} WHERE id={val_id}"
|
||||||
|
await conn.execute(sql, *data.values(), self.id_)
|
||||||
|
else: # insert
|
||||||
|
cols = ', '.join(data.keys())
|
||||||
|
vals = ', '.join([f'${i + 1}' for i in range(len(data))])
|
||||||
|
sql = (
|
||||||
|
f"INSERT INTO {self.table} ({cols}) VALUES ({vals})"
|
||||||
|
f" RETURNING id"
|
||||||
|
)
|
||||||
|
self.id_ = await conn.fetchval(sql, *data.values())
|
||||||
|
|
||||||
|
def asdict(self):
|
||||||
|
"""
|
||||||
|
Return instance data as dictionary.
|
||||||
|
"""
|
||||||
|
return asdict(self)
|
||||||
|
|
||||||
|
async def delete(self, conn: Connection) -> None:
|
||||||
|
"""
|
||||||
|
Delete the object if it has an id_.
|
||||||
|
"""
|
||||||
|
if self.id_:
|
||||||
|
sql = f"DELETE FROM {self.table} WHERE id=$1"
|
||||||
|
await conn.execute(sql, self.id_)
|
||||||
|
|
||||||
|
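
ModelBase drives its SQL from `asdict()`: every dataclass field of a subclass is mapped to a column of `self.table`, and the `id` column is handled by ModelBase itself via `id_`. A hypothetical minimal model (the `note` table and its columns are made up for illustration and are not part of the commit) would look like this:

# Hypothetical subclass sketch; assumes a table `note(id bigserial, title, body)`
# whose columns mirror the dataclass fields exactly.
from dataclasses import dataclass
from typing import ClassVar, Optional


@dataclass
class Note(ModelBase):
    table: ClassVar = 'note'
    title: Optional[str] = None
    body: Optional[str] = None

# await Note(title='hello', body='world').save(conn)  # INSERT ... RETURNING id
# note = await Note().load(conn, 1)                   # SELECT * FROM note WHERE id=$1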
|
||||||
|
class ResourceError:
|
||||||
|
"""
|
||||||
|
Error encountered while trying to fetch a resource.
|
||||||
|
|
||||||
|
ResourceError is used for cases when fetching a resource fails.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, msg, status=None, headers=None):
|
||||||
|
self.msg = msg
|
||||||
|
self.status = status
|
||||||
|
self.headers = headers
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f'ResourceError: {self.msg}'
|
||||||
|
|
||||||
|
|
||||||
|
class ResourceRedirect:
|
||||||
|
"""
|
||||||
|
A resource containing a redirect.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, urls):
|
||||||
|
self.urls = urls
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TextResource(ModelBase):
|
||||||
|
"""
|
||||||
|
TextResource (without path).
|
||||||
|
|
||||||
|
TextResource models web resources with relevant text content.
|
||||||
|
They are instantiated in modules page, document, ...; their metadata
|
||||||
|
are stored in table `resource` and the text content is stored with the
|
||||||
|
search engine.
|
||||||
|
|
||||||
|
Do not confuse with SitePath: Several SitePath instances
|
||||||
|
may point to a TextResource. The TextResource holds the actual content.
|
||||||
|
|
||||||
|
If we are not dealing with the startpage of a new site,
|
||||||
|
the init_fields dict usually will contain the site to which
|
||||||
|
the resource belongs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'resource'
|
||||||
|
init_fields: InitVar[dict] = None # additional fields after fetching
|
||||||
|
search_fields: InitVar[dict] = None # additional fields for indexing
|
||||||
|
|
||||||
|
# database fields
|
||||||
|
simhash: Optional[int] = None
|
||||||
|
content_type: Optional[str] = None
|
||||||
|
last_change: Optional[datetime] = None
|
||||||
|
text_len: int = 0
|
||||||
|
lang: Optional[str] = None
|
||||||
|
title: Optional[str] = None
|
||||||
|
summary: Optional[str] = None
|
||||||
|
|
||||||
|
def __post_init__(self, init_fields, search_fields):
|
||||||
|
if init_fields is None:
|
||||||
|
init_fields = {}
|
||||||
|
self.init_fields = init_fields
|
||||||
|
if search_fields is None:
|
||||||
|
search_fields = {}
|
||||||
|
self.search_fields = search_fields
|
||||||
|
self.site = self.init_fields.get('site')
|
||||||
|
self.site_id = self.site.id_ if self.site else None
|
||||||
|
self._update_simhash()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return (
|
||||||
|
f'TextResource(id={self.id_},'
|
||||||
|
f' site_id={self.site_id},'
|
||||||
|
f' type={self.content_type})'
|
||||||
|
)
|
||||||
|
|
||||||
|
def _update_simhash(self):
|
||||||
|
"""
|
||||||
|
Update the simhash of the resource from its text content.
|
||||||
|
"""
|
||||||
|
if self.simhash is None:
|
||||||
|
text = self.search_fields.get('text', '')
|
||||||
|
self.simhash = simhash_to_bigint(get_simhash(text))
|
||||||
|
|
||||||
|
async def save(self, conn: Connection):
|
||||||
|
"""
|
||||||
|
Save the instance, extending the parent's method.
|
||||||
|
"""
|
||||||
|
self.content_type = (
|
||||||
|
self.content_type[:50] if self.content_type else None
|
||||||
|
)
|
||||||
|
self.title = self.title[:200] if self.title else None
|
||||||
|
self.summary = self.summary[:400] if self.summary else None
|
||||||
|
self._update_simhash()
|
||||||
|
if self.last_change is None:
|
||||||
|
self.last_change = datetime.utcnow()
|
||||||
|
await super().save(conn)
|
||||||
|
|
||||||
|
async def update_from_resource(self, upd: 'TextResource'):
|
||||||
|
"""
|
||||||
|
Update self with values from another resource.
|
||||||
|
"""
|
||||||
|
names = [field.name for field in fields(self)]
|
||||||
|
for name in names:
|
||||||
|
cur_val = getattr(self, name)
|
||||||
|
upd_val = getattr(upd, name)
|
||||||
|
if not cur_val and upd_val is not None:
|
||||||
|
setattr(self, name, upd_val)
|
||||||
|
init_names = [
|
||||||
|
'headers',
|
||||||
|
'redirects',
|
||||||
|
'links_int',
|
||||||
|
'links_ext',
|
||||||
|
'shortlinks',
|
||||||
|
'canonical',
|
||||||
|
#'head',
|
||||||
|
]
|
||||||
|
self.init_fields = upd.init_fields
|
||||||
|
self.search_fields = upd.search_fields
|
||||||
|
# for init_name in init_names:
|
||||||
|
# cur_val = self.init_fields.get(init_name)
|
||||||
|
# upd_val = upd.init_fields.get(init_name)
|
||||||
|
# if not cur_val and upd_val is not None:
|
||||||
|
# self.init_fields[init_name] = upd_val
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MetaResource(ModelBase):
|
||||||
|
"""
|
||||||
|
Parent class for Feed, Sitemap, SitemapIndex.
|
||||||
|
|
||||||
|
MetaResource is a parent class for Feed, Sitemap, SitemapIndex.
|
||||||
|
Their instances are not stored. Note: class Feed contains feed meta data
|
||||||
|
and is stored in the database.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SitemapIndex(MetaResource):
|
||||||
|
"""
|
||||||
|
A SitemapIndex meta resource.
|
||||||
|
|
||||||
|
Just a list of the siteap URLs, nothing more.
|
||||||
|
"""
|
||||||
|
|
||||||
|
sitemaps: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Sitemap(MetaResource):
|
||||||
|
"""
|
||||||
|
A Sitemap meta resource.
|
||||||
|
|
||||||
|
Just a list of the resulting links, nothing more.
|
||||||
|
"""
|
||||||
|
|
||||||
|
urls: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Feed(MetaResource):
|
||||||
|
"""
|
||||||
|
A site's feed (RSS, Atom , ...).
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'site_feed'
|
||||||
|
entries: InitVar[list] = None
|
||||||
|
site_id: Optional[int] = None
|
||||||
|
url: Optional[str] = None
|
||||||
|
etag: Optional[str] = None
|
||||||
|
modified: Optional[str] = None
|
||||||
|
t_visit: Optional[datetime] = None
|
||||||
|
t_content: Optional[datetime] = None
|
||||||
|
version: Optional[str] = None
|
||||||
|
title: Optional[str] = None
|
||||||
|
description: Optional[str] = None
|
||||||
|
fail_count: int = 0
|
||||||
|
|
||||||
|
def __post_init__(self, entries):
|
||||||
|
self.entries = entries
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f'Feed(id={self.id_}, site_id={self.site_id}, url={self.url})'
|
||||||
|
|
||||||
|
async def save(self, conn: Connection):
|
||||||
|
"""
|
||||||
|
Save, trying to merge with existing entry matching on site_id and url.
|
||||||
|
"""
|
||||||
|
if not self.site_id or not self.url:
|
||||||
|
msg = f'Saving feed failed: missing site_id of url'
|
||||||
|
logger.error(msg)
|
||||||
|
return
|
||||||
|
sql = "SELECT id FROM site_feed WHERE site_id=$1 AND url=$2"
|
||||||
|
self.id_ = await conn.fetchval(sql, self.site_id, self.url)
|
||||||
|
await super().save(conn)
|
||||||
|
|
||||||
|
def debug(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the instance data asa string for debug print output.
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
f'Feed:\n'
|
||||||
|
f'- id: {self.id_}\n'
|
||||||
|
f'- site_id: {self.site_id}\n'
|
||||||
|
f'- url: {self.url}\n'
|
||||||
|
f'- etag: {self.etag}\n'
|
||||||
|
f'- modified: {self.modified}\n'
|
||||||
|
f'- t_visit: {self.t_visit}\n'
|
||||||
|
f'- t_content: {self.t_content}\n'
|
||||||
|
f'- version: {self.version}\n'
|
||||||
|
f'- title: {self.title}\n'
|
||||||
|
f'- description: {self.description}\n'
|
||||||
|
f'- fail_count: {self.fail_count}\n'
|
||||||
|
f'- entries: {self.entries}'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Site(ModelBase):
|
||||||
|
"""
|
||||||
|
Website.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'site'
|
||||||
|
base_durl: InitVar[Durl] = None
|
||||||
|
feeds: InitVar[dict] = None
|
||||||
|
links_ext: InitVar[dict] = None
|
||||||
|
links_int: InitVar[dict] = None
|
||||||
|
startpage_text: InitVar[str] = None
|
||||||
|
|
||||||
|
canonical_url: Optional[str] = None
|
||||||
|
base_url: Optional[str] = None
|
||||||
|
base_urls: list[str] = field(default_factory=list)
|
||||||
|
domains: list[str] = field(default_factory=list)
|
||||||
|
ips: Optional[list[str]] = None
|
||||||
|
crawl_enabled: bool = False
|
||||||
|
crawl_active: bool = False
|
||||||
|
next_full_crawl: Optional[datetime] = None
|
||||||
|
next_feed_crawl: Optional[datetime] = None
|
||||||
|
last_update: Optional[datetime] = None
|
||||||
|
last_pub: Optional[datetime] = None
|
||||||
|
pub_dates: Optional[dict[str, str]] = None
|
||||||
|
langs: list[str] = field(default_factory=list)
|
||||||
|
alt_langs: dict[str, str] = field(default_factory=dict)
|
||||||
|
title: Optional[str] = None
|
||||||
|
description: Optional[str] = None
|
||||||
|
keywords: list[str] = field(default_factory=list)
|
||||||
|
linkbacks: dict[str, str] = field(default_factory=dict)
|
||||||
|
meta_info: dict = field(default_factory=dict)
|
||||||
|
boilerplate_texts: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
def __post_init__(
|
||||||
|
self,
|
||||||
|
base_durl: Durl,
|
||||||
|
feeds=None,
|
||||||
|
links_ext=None,
|
||||||
|
links_int=None,
|
||||||
|
startpage_text=None,
|
||||||
|
):
|
||||||
|
self.feeds = feeds
|
||||||
|
self.links_ext = links_ext
|
||||||
|
self.links_int = links_int
|
||||||
|
self.startpage_text = startpage_text
|
||||||
|
self.keywords = self.keywords[:20]
|
||||||
|
if not self.last_update:
|
||||||
|
self.last_update = datetime.utcnow()
|
||||||
|
pub_date: Optional[str]
|
||||||
|
if self.last_pub:
|
||||||
|
pub_date = date.isoformat(self.last_pub.date())
|
||||||
|
self.pub_dates = {date.isoformat(self.last_update): pub_date}
|
||||||
|
else:
|
||||||
|
pub_date = None
|
||||||
|
self.pub_dates = {}
|
||||||
|
if base_durl:
|
||||||
|
self.base_urls = [base_durl.url()[:200]]
|
||||||
|
self.domains = [extract_domain(base_durl.hostname)[:100]]
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return (
|
||||||
|
f'Site(id={self.id_}, url={self.base_url},'
|
||||||
|
f' crawl_enabled={self.crawl_enabled})'
|
||||||
|
)
|
||||||
|
|
||||||
|
async def update_base_url(self) -> None:
|
||||||
|
"""
|
||||||
|
Update the base_url, choosing the most relevant URL.
|
||||||
|
|
||||||
|
If canonical_url is not None, use this.
|
||||||
|
Otherwise set self.base_url to the shortest from self.base_urls,
|
||||||
|
but requiring an https URL if there is at least one.
|
||||||
|
"""
|
||||||
|
if self.canonical_url and self.canonical_url not in self.base_urls:
|
||||||
|
if canonical_durl := await Durl(self.canonical_url):
|
||||||
|
self.base_urls.append(self.canonical_url)
|
||||||
|
domain = extract_domain(canonical_durl.hostname)
|
||||||
|
if domain not in self.domains:
|
||||||
|
self.domains.append(domain)
|
||||||
|
if self.canonical_url:
|
||||||
|
self.base_url = self.canonical_url
|
||||||
|
return
|
||||||
|
if not self.base_url:
|
||||||
|
url_candidates = self.base_urls
|
||||||
|
if https_urls := [
|
||||||
|
url for url in self.base_urls if url.startswith('https://')
|
||||||
|
]:
|
||||||
|
url_candidates = https_urls
|
||||||
|
self.base_url = min(url_candidates, key=len)
|
||||||
|
|
||||||
|
async def save( # type: ignore
|
||||||
|
self, conn, merge=True
|
||||||
|
) -> tuple[Optional[int], bool]:
|
||||||
|
"""
|
||||||
|
Store the site, optionally trying to merge it with an existing site.
|
||||||
|
|
||||||
|
Return the id of the saved instance and whether a new instance
|
||||||
|
was created.
|
||||||
|
|
||||||
|
If self.id_ is not 0, replace the data of the existing site with
|
||||||
|
this id. Else if not merge, store as new row, and if merge,
|
||||||
|
try to merge with an existing matching site.
|
||||||
|
"""
|
||||||
|
await self.update_base_url()
|
||||||
|
if not merge:
|
||||||
|
created = not bool(self.id_)
|
||||||
|
await super().save(conn)
|
||||||
|
return self.id_, created
|
||||||
|
if self.id_:
|
||||||
|
sql = "SELECT base_urls, pub_dates FROM site WHERE id=$1"
|
||||||
|
row = await conn.fetchrow(sql, self.id_)
|
||||||
|
self.base_urls = list(
|
||||||
|
set(row['base_urls']).union(set(self.base_urls))
|
||||||
|
)
|
||||||
|
if previous_pub_dates := row['pub_dates']:
|
||||||
|
if not self.pub_dates:
|
||||||
|
self.pub_dates = {}
|
||||||
|
self.pub_dates.update(previous_pub_dates)
|
||||||
|
await super().save(conn)
|
||||||
|
return self.id_, False
|
||||||
|
same_site_id = await search_same_site(self, conn)
|
||||||
|
if same_site_id:
|
||||||
|
same_site = await Site().load(conn, same_site_id)
|
||||||
|
if same_site_id and same_site:
|
||||||
|
same_site.base_urls = set(same_site.base_urls).union(
|
||||||
|
set(self.base_urls)
|
||||||
|
)
|
||||||
|
same_site.domains = set(same_site.domains).union(set(self.domains))
|
||||||
|
if self.canonical_url and not same_site.canonical_url:
|
||||||
|
same_site.canonical_url = self.canonical_url
|
||||||
|
await same_site.save(conn, merge=False) # call ourselves
|
||||||
|
self.id_ = same_site.id_
|
||||||
|
return self.id_, False
|
||||||
|
else:
|
||||||
|
await super().save(conn)
|
||||||
|
return self.id_, True
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SitePath(ModelBase):
|
||||||
|
"""
|
||||||
|
Path of a website. May point to a Resource.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'site_path'
|
||||||
|
site: InitVar[str] = None
|
||||||
|
|
||||||
|
site_id: Optional[int] = None
|
||||||
|
path: Optional[str] = None
|
||||||
|
filtered: bool = False
|
||||||
|
last_visit: Optional[datetime] = None
|
||||||
|
ok_count: int = 0
|
||||||
|
canonical: Optional[bool] = None
|
||||||
|
resource_id: Optional[int] = None
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return (
|
||||||
|
f'SitePath(id={self.id_}, site_id={self.site_id},'
|
||||||
|
f' path={self.path})'
|
||||||
|
)
|
||||||
|
|
||||||
|
async def save(self, conn: Connection):
|
||||||
|
"""
|
||||||
|
Save the instance, extending the parent's method.
|
||||||
|
"""
|
||||||
|
self.path = self.path[:400] if self.path else ''
|
||||||
|
await super().save(conn)
|
||||||
|
|
||||||
|
async def unlink_resource(self, conn, engine, index_base_name):
|
||||||
|
"""
|
||||||
|
Unlink the resource and also delete it, if it has no more links.
|
||||||
|
"""
|
||||||
|
if self.id_:
|
||||||
|
if self.resource_id:
|
||||||
|
sql = "SELECT COUNT(*) FROM site_path WHERE resource_id=$1"
|
||||||
|
ref_count = await conn.fetchval(sql, self.resource_id)
|
||||||
|
if ref_count == 0:
|
||||||
|
sql = (
|
||||||
|
"DELETE FROM resource WHERE id=$1"
|
||||||
|
" RETURNING (true, lang)"
|
||||||
|
)
|
||||||
|
found = await conn.fetchval(sql, self.resource_id)
|
||||||
|
if found:
|
||||||
|
await delete_resource(
|
||||||
|
engine, found[1], self.resource_id
|
||||||
|
)
|
||||||
|
self.resource_id = None
|
||||||
|
|
||||||
|
def url(self, site):
|
||||||
|
"""
|
||||||
|
Return the full URL (combine the site's base_url with our path).
|
||||||
|
"""
|
||||||
|
return site.base_url + self.path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Crawl(ModelBase):
|
||||||
|
"""
|
||||||
|
The crawl process of a website (begin, end, statistics, ...).
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'crawl'
|
||||||
|
site_id: Optional[int] = None
|
||||||
|
is_full: bool = False
|
||||||
|
t_begin: datetime = field(default_factory=datetime.utcnow)
|
||||||
|
t_end: Optional[datetime] = None
|
||||||
|
n_resources: int = 0
|
||||||
|
n_resources_new: int = 0
|
||||||
|
|
||||||
|
async def finish(self, conn, set_t_end):
|
||||||
|
"""
|
||||||
|
Save the crawl. Set t_end only if indicated.
|
||||||
|
"""
|
||||||
|
if set_t_end:
|
||||||
|
self.t_end = datetime.utcnow()
|
||||||
|
await self.save(conn)
|
||||||
|
|
||||||
|
|
||||||
|
async def search_same_site(
|
||||||
|
site: Site,
|
||||||
|
conn: Connection,
|
||||||
|
) -> Optional[int]:
|
||||||
|
"""
|
||||||
|
Try to find a matching site for the given *site* and return its id.
|
||||||
|
|
||||||
|
TODO: if the path is non-trivial, require it also for the matching site
|
||||||
|
|
||||||
|
Two sites match when they return the same content for identical paths.
|
||||||
|
The base_url (scheme and/or netloc) may differ.
|
||||||
|
We do not have the content for all paths of both websites, so we need
|
||||||
|
to estimate: We only take into account meta information from the
|
||||||
|
start pages of both sites, in particular the title, description
|
||||||
|
and information obtained from the base_urls:
|
||||||
|
|
||||||
|
We use a combination of these conditions:
|
||||||
|
|
||||||
|
1. one of the sites has a canonical URL which matches the
|
||||||
|
URL of the other site
|
||||||
|
2. the content fields (title, description) have sufficient information
|
||||||
|
3. the content fields match exactly
|
||||||
|
4. the domain matches
|
||||||
|
5. the domain matches, except for the TLD
|
||||||
|
6. the base_urls differ in their schemes (http vs. https)
|
||||||
|
7. the hostnames in the base_urls are identical
|
||||||
|
8. the hostnames in the base_urls differ by a prepended 'www.'
|
||||||
|
9. the IPs have at least one common address
|
||||||
|
|
||||||
|
The algorithm is this (first answer is final, yes means match):
|
||||||
|
|
||||||
|
* if (1) : yes
|
||||||
|
* if (2), (3), (4) : yes
|
||||||
|
* if (2), (3), (5), (9) : yes
|
||||||
|
* if (6), ((7) or (8)) : yes
|
||||||
|
* no
|
||||||
|
"""
|
||||||
|
# rule (1)
|
||||||
|
if site.canonical_url:
|
||||||
|
sql = "SELECT id FROM site WHERE $1=ANY(base_urls) LIMIT 1"
|
||||||
|
id_ = await conn.fetchval(sql, site.canonical_url)
|
||||||
|
if id_:
|
||||||
|
return id_
|
||||||
|
else:
|
||||||
|
sql = "SELECT id FROM site WHERE canonical_url=ANY($1) LIMIT 1"
|
||||||
|
id_ = await conn.fetchval(sql, site.base_urls)
|
||||||
|
if id_:
|
||||||
|
return id_
|
||||||
|
|
||||||
|
# rule (6), ((7) or (8))
|
||||||
|
url_variants = set(
|
||||||
|
chain.from_iterable(
|
||||||
|
get_url_variants(base_url) for base_url in site.base_urls
|
||||||
|
)
|
||||||
|
)
|
||||||
|
sql = f"SELECT id FROM site WHERE base_urls && $1 LIMIT 1"
|
||||||
|
if id_ := await conn.fetchval(sql, url_variants):
|
||||||
|
return id_
|
||||||
|
|
||||||
|
# condition (2)
|
||||||
|
if len(site.title or '') > 15 or len(site.description or '') > 15:
|
||||||
|
sql = (
|
||||||
|
f"SELECT * FROM site WHERE"
|
||||||
|
f" COALESCE(title, '')=$1 AND COALESCE(description, '')=$2"
|
||||||
|
)
|
||||||
|
rows = await conn.fetch(sql, site.title or '', site.description or '')
|
||||||
|
# condition (3)
|
||||||
|
if rows:
|
||||||
|
# condition (4)
|
||||||
|
for row in rows:
|
||||||
|
domains = set(row.get('domains', []))
|
||||||
|
if domains & set(site.domains):
|
||||||
|
return row['id']
|
||||||
|
# condition (9)
|
||||||
|
for row in rows:
|
||||||
|
ips = set(row.get('ips', []))
|
||||||
|
if site.ips and ips & set(site.ips):
|
||||||
|
# condition (5)
|
||||||
|
domains_ = row.get('domains', [])
|
||||||
|
d1 = set([tldextract.extract(d).domain for d in domains_])
|
||||||
|
domains_ = site.domains or []
|
||||||
|
d2 = set([tldextract.extract(d).domain for d in domains_])
|
||||||
|
if d1 & d2:
|
||||||
|
return row['id']
|
||||||
|
|
||||||
|
return None
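# A minimal, illustrative sketch of rules (6)-(8) above (not part of the
# crawler): two sites count as the same when their base URLs differ only in
# scheme and/or a leading 'www.'. It reuses get_url_variants() exactly as
# search_same_site() does, assuming it yields those scheme/'www.' variants;
# the URLs are made up.
def _example_scheme_www_match() -> bool:
    existing_base_urls = {'https://www.example.org'}
    new_base_url = 'http://example.org'
    variants = set(get_url_variants(new_base_url))
    # truthy if the assumption about get_url_variants() holds
    return bool(existing_base_urls & variants)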
|
0
src/atextcrawler/plugin_defaults/__init__.py
Normal file
22
src/atextcrawler/plugin_defaults/filter_resource_path.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
"""
|
||||||
|
Filter paths found in a resource.
|
||||||
|
|
||||||
|
This plugin implements :func:`rp_filter`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
def rp_filter(site, durl) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Adjust or filter found paths (may depend on site).
|
||||||
|
|
||||||
|
To filter out a path (i.e., not add it to table `site_path`)
|
||||||
|
return None.
|
||||||
|
"""
|
||||||
|
path = durl.pwa()
|
||||||
|
# skip fetching images (linked from a tags; img tags are skipped anyway)
|
||||||
|
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
|
||||||
|
return None
|
||||||
|
path = path.removesuffix('?amp=1')
|
||||||
|
return path
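# Illustrative sketch of a stricter variant of this plugin (not one of the
# shipped defaults): it drops a few more media suffixes. Only the
# (site, durl) interface shown above is assumed; the suffix list is made up.
def rp_filter_strict(site, durl) -> Optional[str]:
    """
    Example variant of :func:`rp_filter`; returning None drops the path.
    """
    path = durl.pwa()
    if path.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.mp4')):
        return None
    path = path.removesuffix('?amp=1')
    return path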
|
47
src/atextcrawler/plugin_defaults/filter_site.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
"""
|
||||||
|
Relevance estimation of sites.
|
||||||
|
|
||||||
|
This plugin implements :func:`site_filter`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from atextcrawler.models import Site
|
||||||
|
|
||||||
|
MIN_RELEVANCE_SCORE = 5
|
||||||
|
|
||||||
|
|
||||||
|
async def site_filter(site: Site) -> bool:
|
||||||
|
"""
|
||||||
|
Assess relevance of the site (using language-dependent criteria).
|
||||||
|
|
||||||
|
If the site shall be crawled, return True, else False.
|
||||||
|
"""
|
||||||
|
# limit to sites in English or German language
|
||||||
|
if not set(['de', 'en']) & set(site.langs):
|
||||||
|
return False
|
||||||
|
score = 0.0
|
||||||
|
for crit_name, weight, langs, crit_re in re_criteria:
|
||||||
|
if '*' in langs or set(langs) & set(site.langs):
|
||||||
|
findings = crit_re.findall(site.startpage_text)
|
||||||
|
if findings:
|
||||||
|
score += weight * len(findings)
|
||||||
|
if site.title and crit_re.search(site.title):
|
||||||
|
score += 4 * weight
|
||||||
|
if site.description and crit_re.search(site.description):
|
||||||
|
score += 4 * weight
|
||||||
|
|
||||||
|
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
|
||||||
|
|
||||||
|
return score >= MIN_RELEVANCE_SCORE
|
||||||
|
|
||||||
|
|
||||||
|
re_criteria = {
|
||||||
|
(
|
||||||
|
'anarch',
|
||||||
|
1.0,
|
||||||
|
('*',),
|
||||||
|
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
|
||||||
|
),
|
||||||
|
('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
|
||||||
|
}
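# Illustrative only: an additional criterion would be one more
# (name, weight, languages, compiled_regex) tuple in the same shape as the
# entries of re_criteria above; the keyword and weight here are made up and
# this set is not used anywhere.
re_criteria_example = {
    ('syndikal', 0.5, ('de', 'en'), re.compile('(syndi[ck]alis)', re.I)),
}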
|
24
src/atextcrawler/plugin_defaults/filter_site_path.py
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
"""
|
||||||
|
Plugin for filtering paths of a site to be retrieved.
|
||||||
|
|
||||||
|
This plugin implements :func:`sp_filter`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def sp_filter(site, path, robots) -> bool:
|
||||||
|
"""
|
||||||
|
Per-site path filter. Return whether the path shall be retrieved.
|
||||||
|
"""
|
||||||
|
if not robots.can_fetch_url(site.base_url + path):
|
||||||
|
return False
|
||||||
|
if 'amusewiki' in site.meta_info.get('generator', '').lower():
|
||||||
|
if any(
|
||||||
|
[
|
||||||
|
path.endswith(end)
|
||||||
|
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
|
||||||
|
]
|
||||||
|
):
|
||||||
|
return False
|
||||||
|
if '/bbselect?' in path:
|
||||||
|
return False
|
||||||
|
return True
|
10
src/atextcrawler/resource/__init__.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
from .dedup import store_boilerplate_texts
|
||||||
|
from .feed import feed_types, update_feed
|
||||||
|
from .fetch import ResourceFetcher
|
||||||
|
from .operations import (
|
||||||
|
add_site_paths,
|
||||||
|
get_site_path,
|
||||||
|
process_site_path,
|
||||||
|
store_feed_entries,
|
||||||
|
)
|
||||||
|
from .sitemap import extract_sitemap_paths, get_sitemap_urls
|
96
src/atextcrawler/resource/__main__.py
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
"""
|
||||||
|
Dev tool for fetching and displaying a resource.
|
||||||
|
|
||||||
|
Has no permanent effects.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pprint import pformat
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
from ..models import Feed, TextResource
|
||||||
|
from ..resource import ResourceFetcher
|
||||||
|
from ..utils.annotation import pack_annotations, unpack_annotations
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
|
||||||
|
logger = logging.getLogger()
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
logger.addHandler(logging.StreamHandler())
|
||||||
|
logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug')
|
||||||
|
logger_page_debug.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
def add_tags(text, annotations):
|
||||||
|
"""
|
||||||
|
Reconstruct html from text and annotations.
|
||||||
|
|
||||||
|
This is very similar to what the client does when displaying
|
||||||
|
a cached hit.
|
||||||
|
"""
|
||||||
|
html = ''
|
||||||
|
opening_tags = defaultdict(list)
|
||||||
|
closing_tags = defaultdict(list)
|
||||||
|
anns_tags = sorted(
|
||||||
|
annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1])
|
||||||
|
)
|
||||||
|
for (i, f), anns in anns_tags:
|
||||||
|
opening_tags[i] += [tag for tag in reversed(anns)]
|
||||||
|
closing_tags[f] += [tag for tag in reversed(anns)]
|
||||||
|
positions = sorted(set(opening_tags.keys()) | set(closing_tags.keys()))
|
||||||
|
last_pos = 0
|
||||||
|
links = {i: href for href, (i, f, rel) in annotations['links'].items()}
|
||||||
|
for pos in positions:
|
||||||
|
html += text[last_pos:pos]
|
||||||
|
closing = closing_tags.get(pos, [])
|
||||||
|
opening = opening_tags.get(pos, [])
|
||||||
|
common = set(closing) & set(opening)
|
||||||
|
closing = [tag for tag in closing if tag not in common]
|
||||||
|
opening = [tag for tag in opening if tag not in common]
|
||||||
|
tags_html = ''
|
||||||
|
for tag in reversed(closing):
|
||||||
|
html += f'</{tag}>\n'
|
||||||
|
for tag in opening:
|
||||||
|
if tag == 'a':
|
||||||
|
href = links.get(pos, '#')
|
||||||
|
html += f'<a href="{href}">'
|
||||||
|
else:
|
||||||
|
html += f'<{tag}>'
|
||||||
|
last_pos = pos
|
||||||
|
return html
|
||||||
|
|
||||||
|
|
||||||
|
async def run():
|
||||||
|
"""
|
||||||
|
Fetch and display a resource with URL given as cmdline argument.
|
||||||
|
"""
|
||||||
|
url = sys.argv[1]
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
if not (durl := await Durl(url)):
|
||||||
|
return
|
||||||
|
fetcher = ResourceFetcher(session)
|
||||||
|
resource = await fetcher.fetch(url)
|
||||||
|
if isinstance(resource, TextResource):
|
||||||
|
logger.warning(repr(resource))
|
||||||
|
logger.warning(f'Language: {resource.lang}')
|
||||||
|
logger.warning(pformat(resource.search_fields))
|
||||||
|
logger.warning(pformat(resource.init_fields))
|
||||||
|
|
||||||
|
# annotations = resource.search_fields.get('annotations')
|
||||||
|
# text = resource.search_fields['text']
|
||||||
|
# with open('/tmp/1.html', 'w') as f:
|
||||||
|
# html = add_tags(text, annotations)
|
||||||
|
# f.write(f'<html lang="de">\n<head><title>hhh</title></head>'
|
||||||
|
# f'<body>\n{html}\n</body></html>')
|
||||||
|
elif isinstance(resource, Feed):
|
||||||
|
logger.warning(resource.debug())
|
||||||
|
else:
|
||||||
|
logger.warning(f'Resource has type {type(resource)}')
|
||||||
|
logger.warning(resource)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
asyncio.run(run())
|
59
src/atextcrawler/resource/dedup.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
"""
|
||||||
|
Find boilerplate texts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from ..models import TextResource
|
||||||
|
from ..utils.probe import extract_samples
|
||||||
|
from ..utils.section import iter_sections
|
||||||
|
|
||||||
|
|
||||||
|
async def store_boilerplate_texts(fetcher, conn, site):
|
||||||
|
"""
|
||||||
|
Find and store boilerplate texts of a site.
|
||||||
|
|
||||||
|
Fetch the start page and internal sample links obtained from it.
|
||||||
|
If there are sufficiently frequently appearing text sections,
|
||||||
|
consider them as boilerplate texts.
|
||||||
|
|
||||||
|
If boilerplate_texts were found, update the given site instance.
|
||||||
|
"""
|
||||||
|
startpage = await fetcher.fetch(site.base_url, site=site)
|
||||||
|
if (
|
||||||
|
not isinstance(startpage, TextResource)
|
||||||
|
or startpage.content_type != 'html'
|
||||||
|
):
|
||||||
|
return
|
||||||
|
|
||||||
|
# fetch sample resources
|
||||||
|
sample_links = extract_samples(startpage.init_fields['links_int'])
|
||||||
|
resources = [startpage]
|
||||||
|
for sample_link in sample_links:
|
||||||
|
if sample_link.path == site.base_url: # avoid duplicate resources
|
||||||
|
continue # NB: duplicate resources may have different paths
|
||||||
|
sample_resource = await fetcher.fetch(sample_link.url(), site=None)
|
||||||
|
if (
|
||||||
|
isinstance(sample_resource, TextResource)
|
||||||
|
and sample_resource.content_type == 'html'
|
||||||
|
):
|
||||||
|
resources.append(sample_resource)
|
||||||
|
|
||||||
|
# find common texts in resources
|
||||||
|
if (n_resources := len(resources)) > 2:
|
||||||
|
text_freq = Counter()
|
||||||
|
for resource in resources:
|
||||||
|
text = resource.search_fields['text']
|
||||||
|
semantic_breaks = resource.search_fields['annotations'][
|
||||||
|
'semantic_breaks'
|
||||||
|
]
|
||||||
|
for sec in iter_sections(text, semantic_breaks):
|
||||||
|
text_freq[sec[3]] += 1
|
||||||
|
boilerplate_texts = []
|
||||||
|
if min(text_freq.values() or [0]) == 1: # no resource fetched twice
|
||||||
|
for text, freq in text_freq.items():
|
||||||
|
if freq > 2:
|
||||||
|
boilerplate_texts.append(text)
|
||||||
|
sql = "UPDATE site SET boilerplate_texts=$1 WHERE id=$2"
|
||||||
|
await conn.execute(sql, boilerplate_texts, site.id_)
|
||||||
|
site.boilerplate_texts = boilerplate_texts
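# A minimal, self-contained sketch of the counting idea used above, kept
# separate from the crawler logic (the input is made up): a section text
# counts as boilerplate when it appears on more than two of the sampled pages.
def _example_boilerplate(section_texts_per_page: list[list[str]]) -> list[str]:
    freq: Counter = Counter()
    for sections in section_texts_per_page:
        for text in sections:
            freq[text] += 1
    return [text for text, n in freq.items() if n > 2]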
|
131
src/atextcrawler/resource/document.py
Normal file
|
@ -0,0 +1,131 @@
|
||||||
|
"""
|
||||||
|
Parse documents (often application/pdf).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from tika import parser
|
||||||
|
|
||||||
|
from ..models import ResourceError, ResourceRedirect, Site, TextResource
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
from ..utils.http import get_header_links
|
||||||
|
from ..utils.lang import extract_content_language
|
||||||
|
from .plaintext import annotate_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger_debug = logging.getLogger(__name__ + '.debug')
|
||||||
|
logger_debug.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
re_url = re.compile(
|
||||||
|
r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
|
||||||
|
r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_document(
|
||||||
|
durl: Durl,
|
||||||
|
resp: dict,
|
||||||
|
site: Optional[Site],
|
||||||
|
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
|
||||||
|
"""
|
||||||
|
Extract plain text from documents in various formats.
|
||||||
|
"""
|
||||||
|
content = resp['content']
|
||||||
|
|
||||||
|
# HTTP headers, canonical URL, shortlink
|
||||||
|
header_links = await get_header_links(resp['headers'], durl, site)
|
||||||
|
if canonical := header_links.get('canonical'):
|
||||||
|
if canonical != durl.url():
|
||||||
|
return ResourceRedirect(resp['redirects'] + [canonical])
|
||||||
|
shortlink = header_links.get('shortlink')
|
||||||
|
|
||||||
|
# use tika to extract text
|
||||||
|
doc = parser.from_buffer(content)
|
||||||
|
# logger.debug(pformat(doc))
|
||||||
|
if doc.get('status') != 200:
|
||||||
|
msg = f'Analyzing document failed: {durl.url()}'
|
||||||
|
return ResourceError(msg)
|
||||||
|
|
||||||
|
# collect meta data
|
||||||
|
meta = doc.get('metadata', {})
|
||||||
|
content_type = meta.get('Content-Type')
|
||||||
|
if isinstance(content_type, list):
|
||||||
|
content_type = content_type[-1]
|
||||||
|
title = concat(meta.get('title'))
|
||||||
|
concat(meta.get('creator'))
|
||||||
|
last_change = extract_latest(meta.get('date') or meta.get('created'))
|
||||||
|
keywords = None
|
||||||
|
|
||||||
|
# text content
|
||||||
|
text = (doc.get('content') or '').strip()
|
||||||
|
|
||||||
|
# links
|
||||||
|
links_int: dict[Durl, tuple[list[str], str]] = {}
|
||||||
|
links_ext: dict[Durl, tuple[list[str], str]] = {}
|
||||||
|
for url in re_url.findall(text):
|
||||||
|
link_durl = await Durl(url[0])
|
||||||
|
if link_durl:
|
||||||
|
if link_durl.site() == durl.site():
|
||||||
|
links_int[link_durl] = [], link_durl.url()
|
||||||
|
else:
|
||||||
|
links_ext[link_durl] = [], link_durl.url()
|
||||||
|
|
||||||
|
# annotations
|
||||||
|
text, annotations = annotate_text(text)
|
||||||
|
|
||||||
|
return TextResource(
|
||||||
|
content_type=content_type,
|
||||||
|
last_change=last_change,
|
||||||
|
text_len=len(text),
|
||||||
|
lang=extract_content_language(text),
|
||||||
|
title=title,
|
||||||
|
init_fields={
|
||||||
|
'durl': durl,
|
||||||
|
'site': site,
|
||||||
|
'headers': resp['headers'],
|
||||||
|
'redirects': resp['redirects'],
|
||||||
|
'links_int': links_int,
|
||||||
|
'links_ext': links_ext,
|
||||||
|
'shortlink': shortlink,
|
||||||
|
'canonical': None,
|
||||||
|
},
|
||||||
|
search_fields={
|
||||||
|
'title': title,
|
||||||
|
'pub_date': last_change,
|
||||||
|
'keywords': keywords,
|
||||||
|
'text': text,
|
||||||
|
'annotations': annotations,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]:
|
||||||
|
"""
|
||||||
|
Extract the lastest date (if any) from a string or list of strings.
|
||||||
|
"""
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
if not isinstance(s, list):
|
||||||
|
s = [s]
|
||||||
|
dt = []
|
||||||
|
for t in s:
|
||||||
|
try:
|
||||||
|
dt.append(datetime.fromisoformat(t.rstrip('Z')))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return max(dt) if dt else None
|
||||||
|
|
||||||
|
|
||||||
|
def concat(s: Optional[Union[str, list]]) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Helper function for joining strings together.
|
||||||
|
"""
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
if not isinstance(s, list):
|
||||||
|
s = [s]
|
||||||
|
return ' '.join(s)
|
155
src/atextcrawler/resource/feed.py
Normal file
|
@ -0,0 +1,155 @@
|
||||||
|
"""
|
||||||
|
Stuff related to feeds.
|
||||||
|
|
||||||
|
Higher-level stuff is in site.feeds.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from asyncpg import Connection
|
||||||
|
from feedparser import parse
|
||||||
|
|
||||||
|
from ..models import Feed, MetaResource, ResourceError
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
feed_types = (
|
||||||
|
'application/rss+xml',
|
||||||
|
'application/atom+xml',
|
||||||
|
'application/feed+json',
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def update_feed(fetcher, feed, conn) -> Optional[list[dict]]:
|
||||||
|
"""
|
||||||
|
Fetch, parse and return a given feed's content. Also update *feed*.
|
||||||
|
|
||||||
|
If the server replied with HTTP 410, delete the feed.
|
||||||
|
If there is no new information (server replied with HTTP 304),
|
||||||
|
return None. For other errors also return None and increase the
|
||||||
|
fail_count.
|
||||||
|
"""
|
||||||
|
headers = {'Cache-control': 'max-age=600'}
|
||||||
|
if feed.modified:
|
||||||
|
headers['If-Modified-Since'] = feed.modified
|
||||||
|
elif feed.etag:
|
||||||
|
headers['If-None-Match'] = feed.etag.removeprefix('W/')
|
||||||
|
resource = await fetcher.fetch(feed.url, headers=headers)
|
||||||
|
if isinstance(resource, ResourceError):
|
||||||
|
if resource.status == 410:
|
||||||
|
msg = f'Feed has vanished, deleting it: {feed}'
|
||||||
|
logger.debug(msg)
|
||||||
|
await feed.delete(conn)
|
||||||
|
if resource.status != 304:
|
||||||
|
feed.fail_count += 1
|
||||||
|
if feed.fail_count > 5:
|
||||||
|
msg = f'Feed not reachable, deleting it: {feed}'
|
||||||
|
logger.debug(msg)
|
||||||
|
await feed.delete(conn)
|
||||||
|
return None # HTTP 304, no new entries
|
||||||
|
elif isinstance(resource, Feed):
|
||||||
|
resource.id_ = feed.id_
|
||||||
|
resource.site_id = feed.site_id
|
||||||
|
await resource.save(conn)
|
||||||
|
return resource.entries
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_json_feed(resp, data: dict) -> Feed:
|
||||||
|
"""
|
||||||
|
Parse a JSON response for jsonfeed information.
|
||||||
|
|
||||||
|
TODO: handle 'next_url' (see https://jsonfeed.org/version/1.1)
|
||||||
|
"""
|
||||||
|
feed = Feed()
|
||||||
|
feed.url = data.get('feed_url', resp['redirects'][-1])
|
||||||
|
feed.etag = resp['headers'].get('ETag')
|
||||||
|
feed.modified = resp['headers'].get('Last-Modified')
|
||||||
|
feed.t_visit = datetime.utcnow()
|
||||||
|
version = data.get('version', '')
|
||||||
|
version = 'json-' + version.removeprefix('https://jsonfeed.org/version/')
|
||||||
|
feed.version = version[:10]
|
||||||
|
feed.title = data.get('title')
|
||||||
|
feed.description = data.get('description')
|
||||||
|
feed.fail_count = 0
|
||||||
|
entries = []
|
||||||
|
latest = None
|
||||||
|
# parse feed entries to a dict compatible with feedparser's entries
|
||||||
|
for feed_item in data.get('items', []):
|
||||||
|
entry = {}
|
||||||
|
entry['link'] = feed_item.get('url')
|
||||||
|
dt = feed_item.get('date_published')
|
||||||
|
if dt:
|
||||||
|
dt = datetime.fromisoformat(dt) if dt else None
|
||||||
|
dt = dt.astimezone(tz=None).replace(tzinfo=timezone.utc)
|
||||||
|
entry['published_parsed'] = dt.timetuple()
|
||||||
|
entry['title'] = feed_item.get('title')
|
||||||
|
entry['summary'] = feed_item.get('summary')
|
||||||
|
entries.append(entry)
|
||||||
|
if dt:
|
||||||
|
latest = max(latest or dt, dt)
|
||||||
|
feed.entries = entries
|
||||||
|
feed.t_content = latest
|
||||||
|
return feed
|
||||||
|
|
||||||
|
|
||||||
|
def parse_xml_feed(resp) -> Union[Feed, ResourceError]:
|
||||||
|
"""
|
||||||
|
Parse a response from Fetcher.get_resp() for xml feed information.
|
||||||
|
"""
|
||||||
|
feed = Feed()
|
||||||
|
feed.url = resp['redirects'][-1]
|
||||||
|
feed.etag = resp['headers'].get('ETag')
|
||||||
|
feed.modified = resp['headers'].get('Last-Modified')
|
||||||
|
feed.t_visit = datetime.utcnow()
|
||||||
|
try:
|
||||||
|
parsed = parse(resp['content'], response_headers=resp['headers'])
|
||||||
|
except Exception as error:
|
||||||
|
return ResourceError(f'Feedparser error: {error}')
|
||||||
|
latest = parsed['feed'].get('updated_parsed')
|
||||||
|
if latest:
|
||||||
|
latest = datetime(*latest[:6])
|
||||||
|
feed.t_content = max(feed.t_content or latest, latest)
|
||||||
|
feed.version = parsed['version']
|
||||||
|
feed.title = parsed['feed'].get('title', '')[:200] or None
|
||||||
|
feed.description = parsed['feed'].get('description')
|
||||||
|
feed.fail_count = 0
|
||||||
|
feed.entries = parsed['entries']
|
||||||
|
return feed
|
||||||
|
|
||||||
|
|
||||||
|
def convert_feed_entries(
|
||||||
|
base_url: Optional[str],
|
||||||
|
entries: list[dict],
|
||||||
|
) -> tuple[
|
||||||
|
list[tuple[str, bool]],
|
||||||
|
dict[str, tuple[Optional[str], Optional[str], Optional[str]]],
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Extract paths and resource meta information from a feed's entries.
|
||||||
|
|
||||||
|
Return paths in a structure wanted by :func:`add_site_paths` and
|
||||||
|
resource meta information in a structure wanted by
|
||||||
|
:func:`update_resource_meta`.
|
||||||
|
"""
|
||||||
|
paths = []
|
||||||
|
resource_meta = {}
|
||||||
|
for entry in entries:
|
||||||
|
if entry.get('link') and entry['link'].startswith(base_url or ''):
|
||||||
|
path = entry['link'].removeprefix(base_url or '').lstrip('/')
|
||||||
|
if len(path) <= 200:
|
||||||
|
last_update = entry.get('published_parsed')
|
||||||
|
if last_update:
|
||||||
|
last_update = datetime(*last_update[:6])
|
||||||
|
paths.append((path, True))
|
||||||
|
resource_meta[path] = (
|
||||||
|
last_update,
|
||||||
|
entry.get('title', '')[:200] or None,
|
||||||
|
entry.get('summary', '')[:2000] or None,
|
||||||
|
)
|
||||||
|
return paths, resource_meta
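# Illustrative only (not called anywhere): the shapes returned by
# convert_feed_entries(), shown with a single made-up feedparser-style entry.
def _example_convert_feed_entries():
    entry = {
        'link': 'https://example.org/blog/hello',
        'published_parsed': datetime(2021, 11, 1, 12, 0, 0).timetuple(),
        'title': 'Hello',
        'summary': 'First post.',
    }
    paths, resource_meta = convert_feed_entries('https://example.org', [entry])
    # paths == [('blog/hello', True)]
    # resource_meta == {'blog/hello': (datetime(2021, 11, 1, 12, 0), 'Hello',
    #                                  'First post.')}
    return paths, resource_meta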
|
327
src/atextcrawler/resource/fetch.py
Normal file
|
@ -0,0 +1,327 @@
|
||||||
|
"""
|
||||||
|
Access to a resource specified by a URL.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import logging
|
||||||
|
from json import loads
|
||||||
|
from traceback import format_exc
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from ..models import (
|
||||||
|
Feed,
|
||||||
|
MetaResource,
|
||||||
|
ResourceError,
|
||||||
|
ResourceRedirect,
|
||||||
|
Site,
|
||||||
|
TextResource,
|
||||||
|
)
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
from ..utils.link import in_blacklist
|
||||||
|
from .document import parse_document
|
||||||
|
from .feed import parse_json_feed, parse_xml_feed
|
||||||
|
from .page import parse_html
|
||||||
|
from .plaintext import parse_plaintext
|
||||||
|
from .sitemap import parse_sitemap, parse_sitemapindex
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
MAX_REDIRECTS = 10
|
||||||
|
"""
|
||||||
|
Maximum number of redirects to follow.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
default_headers = {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)'
|
||||||
|
' Gecko/20100101 Firefox/78.0',
|
||||||
|
'DNT': '1',
|
||||||
|
'Upgrade-Insecure-Requests': '1',
|
||||||
|
'Accept-Language': 'en-US,en;q=0.5, *;q=0.5',
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
Default HTTP client headers, overwriting those of aiohttp.ClientSession.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
blacklist_content_types = [
|
||||||
|
'',
|
||||||
|
'application/ogg',
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
Blacklist for content-types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
text_content_types = {
|
||||||
|
'text/html': 'html',
|
||||||
|
'text/plain': 'plain',
|
||||||
|
'application/rss+xml': 'feed-rss',
|
||||||
|
'application/atom+xml': 'feed-atom',
|
||||||
|
'application/feed+json': 'feed-json',
|
||||||
|
'application/json': 'json',
|
||||||
|
'application/xml': 'xml',
|
||||||
|
'text/xml': 'xml',
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
Map content-types to parsers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class ResourceFetcher:
|
||||||
|
"""
|
||||||
|
Fetch a resource specified by a URL (:meth:`fetch`).
|
||||||
|
|
||||||
|
The timeout is the same for all requests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
session: aiohttp.ClientSession,
|
||||||
|
timeout_sock_connect: Union[int, float] = 8,
|
||||||
|
timeout_sock_read: Union[int, float] = 30,
|
||||||
|
):
|
||||||
|
self.session = session
|
||||||
|
self.timeout = aiohttp.ClientTimeout(
|
||||||
|
sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
|
||||||
|
)
|
||||||
|
|
||||||
|
async def fetch(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
site: Optional[Site] = None,
|
||||||
|
redirect_history: Optional[list[str]] = None,
|
||||||
|
headers: Optional[dict] = None,
|
||||||
|
) -> Union[
|
||||||
|
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Try to fetch a resource and return an instance or error or redirect.
|
||||||
|
|
||||||
|
If an error was encountered, return a ResourceError.
|
||||||
|
If the resource has an irrelevant content type, return None.
|
||||||
|
Otherwise return a specific content instance.
|
||||||
|
|
||||||
|
Argument *redirect_history* contains the redirect history;
|
||||||
|
if one of the redirects is encountered again, return None.
|
||||||
|
"""
|
||||||
|
if redirect_history is None:
|
||||||
|
redirect_history = []
|
||||||
|
if not (durl := await Durl(url)):
|
||||||
|
return ResourceError('Invalid URL')
|
||||||
|
resp = await self.get_resp(
|
||||||
|
durl,
|
||||||
|
redirect_history=redirect_history,
|
||||||
|
headers=headers,
|
||||||
|
)
|
||||||
|
if isinstance(resp, ResourceError):
|
||||||
|
return resp
|
||||||
|
if resp is None:
|
||||||
|
return None
|
||||||
|
result = await self._parse(durl, site, resp)
|
||||||
|
if isinstance(result, (MetaResource, TextResource)):
|
||||||
|
result.id_ = None
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def _parse(
|
||||||
|
self, durl, site, resp, in_recursion=False
|
||||||
|
) -> Union[
|
||||||
|
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Parse a response. May call itself.
|
||||||
|
"""
|
||||||
|
result: Union[
|
||||||
|
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||||
|
] = None
|
||||||
|
content = resp['content']
|
||||||
|
if isinstance(content, str) and content.startswith('<?xml '):
|
||||||
|
result = await parse_xml(durl, resp)
|
||||||
|
elif resp['parser'] == 'feed-rss':
|
||||||
|
result = await parse_xml(durl, resp, rss=True)
|
||||||
|
elif resp['parser'] == 'feed-atom':
|
||||||
|
result = await parse_xml(durl, resp, atom=True)
|
||||||
|
elif resp['parser'] == 'xml':
|
||||||
|
result = await parse_xml(durl, resp)
|
||||||
|
elif resp['parser'] == 'html':
|
||||||
|
result = await parse_html(durl, resp, site)
|
||||||
|
elif resp['parser'] in ('json', 'feed-json'):
|
||||||
|
result = await parse_json(durl, resp)
|
||||||
|
elif resp['parser'] == 'plain':
|
||||||
|
result = await parse_plaintext(durl, resp, site)
|
||||||
|
elif resp['parser'] == 'application':
|
||||||
|
if resp['headers'].get('content-type') == 'application/x-gzip':
|
||||||
|
if in_recursion:
|
||||||
|
return None # consider nested gzip an attack
|
||||||
|
resp['content'] = gzip.decompress(resp['content'])
|
||||||
|
return await self._parse(durl, site, resp, in_recursion=True)
|
||||||
|
result = await parse_document(durl, resp, site)
|
||||||
|
if isinstance(result, ResourceRedirect):
|
||||||
|
redir_url = result.urls[-1]
|
||||||
|
result = await self.fetch(
|
||||||
|
redir_url,
|
||||||
|
site=site,
|
||||||
|
redirect_history=result.urls[:-1],
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def get_resp(
|
||||||
|
self,
|
||||||
|
durl: Durl,
|
||||||
|
headers: dict = None,
|
||||||
|
redirect_history: Optional[list[str]] = None,
|
||||||
|
) -> Optional[Union[ResourceError, dict]]:
|
||||||
|
"""
|
||||||
|
Try to fetch a url returning a ResourceError or a dict with content.
|
||||||
|
|
||||||
|
Optional *headers* will overwrite the :var:`default_headers`.
|
||||||
|
|
||||||
|
If the response status is not 200, always return a ResourceError.
|
||||||
|
|
||||||
|
If the content-type is not relevant (see blacklist_content_types),
|
||||||
|
return None.
|
||||||
|
|
||||||
|
The dict contains these keys+values:
|
||||||
|
|
||||||
|
* 'parser': a hint on the parser to use for analyzing the content;
|
||||||
|
one of 'html', 'plain', 'feed-rss', 'feed-atom', 'feed-json', 'json', 'xml', 'application'
|
||||||
|
* 'content': bytes for type application, otherwise str
|
||||||
|
* 'redirects': a list of URLs visited during HTTP redirection,
|
||||||
|
the last item is the final URL
|
||||||
|
* 'headers': response headers
|
||||||
|
"""
|
||||||
|
if redirect_history is None:
|
||||||
|
redirect_history = []
|
||||||
|
if len(redirect_history) >= MAX_REDIRECTS:
|
||||||
|
return None
|
||||||
|
headers_ = default_headers.copy()
|
||||||
|
if headers:
|
||||||
|
headers_.update(headers)
|
||||||
|
try:
|
||||||
|
async with self.session.get(
|
||||||
|
durl.url(),
|
||||||
|
headers=headers_,
|
||||||
|
timeout=self.timeout,
|
||||||
|
) as resp:
|
||||||
|
redirects = [durl.url()]
|
||||||
|
if resp.history:
|
||||||
|
href = resp.history[-1].headers.get('location')
|
||||||
|
if not href or not (redurl := await Durl(href, base=durl)):
|
||||||
|
msg = 'Invalid URL after HTTP redirect'
|
||||||
|
return ResourceError(msg)
|
||||||
|
if in_blacklist(redurl.hostname):
|
||||||
|
src_url = (
|
||||||
|
redirect_history[0]
|
||||||
|
if redirect_history
|
||||||
|
else durl.url()
|
||||||
|
)
|
||||||
|
msg = (
|
||||||
|
f'Dropping URL {src_url}, since'
|
||||||
|
f' redirected to a blacklisted site'
|
||||||
|
)
|
||||||
|
logger.debug(msg)
|
||||||
|
return None
|
||||||
|
redirects = [str(r.url) for r in resp.history]
|
||||||
|
redirects.append(redurl.url())
|
||||||
|
if join := set(redirect_history) & set(redirects):
|
||||||
|
msg = f'Cyclic redirect {join}'
|
||||||
|
return ResourceError(msg)
|
||||||
|
if resp.status != 200:
|
||||||
|
msg = f'HTTP status {resp.status}'
|
||||||
|
return ResourceError(
|
||||||
|
msg, status=resp.status, headers=headers
|
||||||
|
)
|
||||||
|
c_type = resp.headers.get('content-type', '').split(';')[0]
|
||||||
|
if c_type in blacklist_content_types:
|
||||||
|
return None
|
||||||
|
result: dict[str, Any] = {
|
||||||
|
'redirects': redirect_history + redirects,
|
||||||
|
'headers': resp.headers,
|
||||||
|
}
|
||||||
|
if c_type in text_content_types.keys():
|
||||||
|
try: # catch decoding issues
|
||||||
|
content = await resp.text()
|
||||||
|
except:
|
||||||
|
body = await resp.read()
|
||||||
|
encoding = resp.charset or 'utf-8'
|
||||||
|
encoding = encoding.replace('CP-1250', 'cp1250')
|
||||||
|
content = body.decode(encoding, errors='replace')
|
||||||
|
result['content'] = content
|
||||||
|
result['parser'] = text_content_types[c_type]
|
||||||
|
return result
|
||||||
|
elif c_type.startswith('application/'):
|
||||||
|
result['content'] = await resp.read()
|
||||||
|
result['parser'] = 'application'
|
||||||
|
return result
|
||||||
|
except aiohttp.ClientError as error:
|
||||||
|
# on certificate error try without tls
|
||||||
|
if 'SSLCertVerificationError' in str(error):
|
||||||
|
if durl.scheme == 'https':
|
||||||
|
url = durl.url()
|
||||||
|
durl.replace_scheme('http')
|
||||||
|
response = await self.get_resp(
|
||||||
|
durl=durl,
|
||||||
|
headers=headers,
|
||||||
|
redirect_history=redirect_history + [url],
|
||||||
|
)
|
||||||
|
if not isinstance(response, ResourceError):
|
||||||
|
return response
|
||||||
|
msg = f'ClientError: {error}'
|
||||||
|
return ResourceError(msg)
|
||||||
|
except Exception as error:
|
||||||
|
msg = f'Unknown error: {error}:\n{format_exc()}'
|
||||||
|
logger.error(msg)
|
||||||
|
return ResourceError(msg)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_xml(
|
||||||
|
durl: Durl,
|
||||||
|
response: dict,
|
||||||
|
rss=False,
|
||||||
|
atom=False,
|
||||||
|
) -> Optional[Union[MetaResource, ResourceError]]:
|
||||||
|
"""
|
||||||
|
Parse XML content.
|
||||||
|
|
||||||
|
In particular, parse sitemapindex, sitemap, RSS feed, atom feed.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
xml = response['content']
|
||||||
|
soup = BeautifulSoup(xml, 'html.parser')
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
if rss or (rss := soup.find('rss')):
|
||||||
|
return parse_xml_feed(response)
|
||||||
|
elif atom or (atom := soup.find('atom')):
|
||||||
|
return parse_xml_feed(response)
|
||||||
|
elif sitemapindex := soup.find('sitemapindex'):
|
||||||
|
return parse_sitemapindex(sitemapindex)
|
||||||
|
elif urlset := soup.find('urlset'):
|
||||||
|
return parse_sitemap(urlset)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_json(
|
||||||
|
durl: Durl,
|
||||||
|
response: dict,
|
||||||
|
) -> Optional[Union[Feed, ResourceError]]:
|
||||||
|
"""
|
||||||
|
Parse the content of JSON feeds.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
data = loads(response['content'])
|
||||||
|
except:
|
||||||
|
msg = f'Could not parse JSON from {durl.url()}'
|
||||||
|
logger.debug(msg)
|
||||||
|
return None
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
return None
|
||||||
|
if data.get('version', '').startswith('https://jsonfeed.org/'):
|
||||||
|
return parse_json_feed(response, data)
|
||||||
|
return None
|
347
src/atextcrawler/resource/operations.py
Normal file
|
@ -0,0 +1,347 @@
|
||||||
|
"""
|
||||||
|
Operations on resources.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Sequence
|
||||||
|
|
||||||
|
from asyncpg import Connection
|
||||||
|
|
||||||
|
from ..models import (
|
||||||
|
Feed,
|
||||||
|
MetaResource,
|
||||||
|
ResourceError,
|
||||||
|
Site,
|
||||||
|
Sitemap,
|
||||||
|
SitemapIndex,
|
||||||
|
SitePath,
|
||||||
|
TextResource,
|
||||||
|
)
|
||||||
|
from ..search import delete_resource, index_resource
|
||||||
|
from ..tensorflow import TensorFlow
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
from ..utils.similarity import (
|
||||||
|
create_simhash,
|
||||||
|
search_simhash,
|
||||||
|
simhash_from_bigint,
|
||||||
|
simhash_to_bigint,
|
||||||
|
)
|
||||||
|
from .feed import convert_feed_entries
|
||||||
|
from .fetch import ResourceFetcher
|
||||||
|
from .sitemap import extract_sitemap_paths
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def add_site_paths(
|
||||||
|
conn: Connection,
|
||||||
|
site_id: int,
|
||||||
|
paths: Sequence[tuple[str, Optional[bool]]],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Add site paths. If resource infos are given, also create resources.
|
||||||
|
|
||||||
|
The paths must be given as relative paths, together with a boolean
|
||||||
|
telling whether the link is a canonical link.
|
||||||
|
"""
|
||||||
|
sql = (
|
||||||
|
"INSERT INTO site_path (site_id, path, canonical)"
|
||||||
|
" VALUES ($1, $2, $3) ON CONFLICT (site_id, path) DO NOTHING"
|
||||||
|
)
|
||||||
|
values = (
|
||||||
|
(site_id, path, canonical)
|
||||||
|
for path, canonical in paths[:100000]
|
||||||
|
if len(path) <= 400
|
||||||
|
)
|
||||||
|
await conn.executemany(sql, values)
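# Illustrative only: what a call to add_site_paths() looks like. Paths are
# relative to the site's base_url; the boolean (or None) flags canonical
# links. The site id and paths below are made up.
async def _example_add_site_paths(conn: Connection) -> None:
    example_paths = [
        ('blog/first-post', True),       # known canonical link
        ('blog/first-post/feed', None),  # canonicity unknown
    ]
    await add_site_paths(conn, 123, example_paths)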
|
||||||
|
|
||||||
|
|
||||||
|
async def update_resource_meta(
|
||||||
|
conn: Connection,
|
||||||
|
site_id: int,
|
||||||
|
resource_meta: dict,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Update meta information of existing resources using path to find them.
|
||||||
|
"""
|
||||||
|
sql = (
|
||||||
|
"UPDATE resource SET last_change=coalesce($1, last_change),"
|
||||||
|
" title=coalesce($2, title), summary=coalesce($3, summary) FROM ("
|
||||||
|
" SELECT resource_id FROM site_path WHERE site_id=$4 AND path=$5"
|
||||||
|
") sp WHERE resource.id=sp.resource_id"
|
||||||
|
)
|
||||||
|
values = ((*meta, site_id, path) for path, meta in resource_meta.items())
|
||||||
|
await conn.executemany(sql, values)
|
||||||
|
|
||||||
|
|
||||||
|
async def store_feed_entries(
|
||||||
|
conn: Connection,
|
||||||
|
site: Site,
|
||||||
|
entries: list[dict],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Add missing resources of a site from given feed entries.
|
||||||
|
"""
|
||||||
|
if site.id_:
|
||||||
|
paths, resource_meta = convert_feed_entries(site.base_url, entries)
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
await update_resource_meta(conn, site.id_, resource_meta)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_site_path(
|
||||||
|
conn: Connection,
|
||||||
|
site: Site,
|
||||||
|
before: datetime,
|
||||||
|
only_new=False,
|
||||||
|
) -> Optional[SitePath]:
|
||||||
|
"""
|
||||||
|
Return the next path of a given site that needs to be processed.
|
||||||
|
|
||||||
|
If none needs to be processed, return None.
|
||||||
|
|
||||||
|
Only return paths that have last been visited before *before*
|
||||||
|
or not been processed at all. Paths with a ok_count of -3 or lower
|
||||||
|
are dropped.
|
||||||
|
|
||||||
|
If *only_new*, limit to paths that have not been processed at all,
|
||||||
|
irrespective of the value of *before*.
|
||||||
|
"""
|
||||||
|
if only_new:
|
||||||
|
sql = (
|
||||||
|
"SELECT * FROM site_path"
|
||||||
|
" WHERE site_id=$1 AND last_visit is null LIMIT 1"
|
||||||
|
) # implicitly canonical=null
|
||||||
|
row = await conn.fetchrow(sql, site.id_)
|
||||||
|
else:
|
||||||
|
sql = (
|
||||||
|
"SELECT * FROM site_path"
|
||||||
|
" WHERE site_id=$1 AND canonical IS NOT false AND"
|
||||||
|
" (last_visit is null OR last_visit<$2) AND"
|
||||||
|
" ok_count > -3 LIMIT 1"
|
||||||
|
) # canonical can be true or null
|
||||||
|
row = await conn.fetchrow(sql, site.id_, before)
|
||||||
|
if row:
|
||||||
|
return await SitePath().load_from_row(row)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def process_site_path(
|
||||||
|
app,
|
||||||
|
worker_number: int,
|
||||||
|
conn: Connection,
|
||||||
|
fetcher: ResourceFetcher,
|
||||||
|
tf: TensorFlow,
|
||||||
|
site: Site,
|
||||||
|
site_path: SitePath,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Fetch a path, deduplicate and if canonical, update and index the resource.
|
||||||
|
|
||||||
|
Return whether a new resource was handled that should contribute to the
|
||||||
|
statistics.
|
||||||
|
"""
|
||||||
|
msg = (
|
||||||
|
f'Worker {worker_number} processing site {site.id_}'
|
||||||
|
f' site_path {site_path.id_} {site.base_url}{site_path.path}'
|
||||||
|
)
|
||||||
|
logger.debug(msg)
|
||||||
|
if not site.id_: # only to satisfy typing
|
||||||
|
return False
|
||||||
|
|
||||||
|
# fetch url
|
||||||
|
site_path.last_visit = datetime.utcnow()
|
||||||
|
url = site_path.url(site)
|
||||||
|
resource = await fetcher.fetch(url, site=site)
|
||||||
|
|
||||||
|
# handle failure (possibly deleting old information)
|
||||||
|
if not isinstance(resource, (TextResource, MetaResource)):
|
||||||
|
if not resource: # irrelevant content-type
|
||||||
|
site_path.ok_count = -10
|
||||||
|
elif isinstance(resource, ResourceError):
|
||||||
|
site_path.ok_count -= 1
|
||||||
|
if site_path.ok_count <= -3 and site_path.resource_id:
|
||||||
|
await site_path.unlink_resource(
|
||||||
|
conn,
|
||||||
|
app.search_engine,
|
||||||
|
app.config['elasticsearch']['index_base_name'],
|
||||||
|
)
|
||||||
|
await site_path.save(conn)
|
||||||
|
if resource: # relevant content-type
|
||||||
|
msg = (
|
||||||
|
f'Worker {worker_number} failed to process site_path'
|
||||||
|
f' {site_path.id_} (site {site.id_},'
|
||||||
|
f' {site.base_url}{site_path.path})'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# handle MetaResources
|
||||||
|
if isinstance(resource, MetaResource):
|
||||||
|
if isinstance(resource, Feed):
|
||||||
|
resource.site_id = site.id_
|
||||||
|
await resource.save(conn)
|
||||||
|
if resource.entries:
|
||||||
|
await store_feed_entries(conn, site, resource.entries)
|
||||||
|
elif isinstance(resource, Sitemap):
|
||||||
|
paths, _ = extract_sitemap_paths(site.base_url, resource.urls)
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
elif isinstance(resource, SitemapIndex):
|
||||||
|
for sitemap_dict in resource.sitemaps:
|
||||||
|
url = sitemap_dict['loc']
|
||||||
|
res_sitemap = await fetcher.fetch(url, site=site)
|
||||||
|
if isinstance(res_sitemap, Sitemap):
|
||||||
|
paths, _ = extract_sitemap_paths(
|
||||||
|
site.base_url, res_sitemap.urls
|
||||||
|
)
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# handle TextResource
|
||||||
|
relevant, is_new_resource = await _handle_text_resource(
|
||||||
|
app, conn, tf, site, site_path, resource, url
|
||||||
|
)
|
||||||
|
if not relevant:
|
||||||
|
return False
|
||||||
|
site_path.resource_id = resource.id_
|
||||||
|
site_path.canonical = resource.init_fields.get('canonical')
|
||||||
|
site_path.ok_count += 1
|
||||||
|
await site_path.save(conn)
|
||||||
|
|
||||||
|
if shortlink_url := resource.init_fields.get('shortlink'):
|
||||||
|
await _save_shortlink(
|
||||||
|
conn, site, url, resource, shortlink_url, site_path.last_visit
|
||||||
|
)
|
||||||
|
|
||||||
|
return is_new_resource
|
||||||
|
|
||||||
|
|
||||||
|
async def _handle_text_resource(
|
||||||
|
app, conn, tf, site, site_path, resource, url
|
||||||
|
) -> tuple[bool, bool]:
|
||||||
|
"""
|
||||||
|
Ingest a text resource.
|
||||||
|
|
||||||
|
Return whether the resource is relevant and whether it is new.
|
||||||
|
"""
|
||||||
|
# save the resource's internal links
|
||||||
|
paths = []
|
||||||
|
if links_int := resource.init_fields['links_int']:
|
||||||
|
for durl, (rel, _) in links_int.items():
|
||||||
|
rp_filter = app.plugins['filter_resource_path'].rp_filter
|
||||||
|
if path := rp_filter(site, durl):
|
||||||
|
canon = (rel and rel.lower() == 'canonical') or None
|
||||||
|
paths.append((path, canon))
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
|
||||||
|
# find resources similar to the current text
|
||||||
|
text = resource.search_fields['text']
|
||||||
|
if len(text) < 300: # discard resources with too short texts
|
||||||
|
site_path.resource_id = None
|
||||||
|
await site_path.save(conn)
|
||||||
|
return False, False
|
||||||
|
simhash = simhash_from_bigint(resource.simhash)
|
||||||
|
index = site.simhash_index
|
||||||
|
similar_ids = search_simhash(index, simhash)
|
||||||
|
|
||||||
|
# determine the destination resource and resources to be merged into it
|
||||||
|
old_id = site_path.resource_id
|
||||||
|
if (
|
||||||
|
old_id
|
||||||
|
and old_id in similar_ids
|
||||||
|
and ( # similar to old text
|
||||||
|
dest_resource := await TextResource().load(conn, old_id)
|
||||||
|
)
|
||||||
|
):
|
||||||
|
merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
|
||||||
|
else: # no old text, or old text not similar any more
|
||||||
|
if old_id:
|
||||||
|
await site_path.unlink_resource(
|
||||||
|
conn,
|
||||||
|
app.search_engine,
|
||||||
|
app.config['elasticsearch']['index_base_name'],
|
||||||
|
)
|
||||||
|
# find the first existing similar resource
|
||||||
|
for similar_id in similar_ids:
|
||||||
|
dest_resource = await TextResource().load(conn, similar_id)
|
||||||
|
if dest_resource:
|
||||||
|
# also require similar length
|
||||||
|
l1 = len(resource.search_fields['text'])
|
||||||
|
l2 = dest_resource.text_len
|
||||||
|
if 0.95 * l2 <= l1 <= 1.05 * l2:
|
||||||
|
merge_ids = list(
|
||||||
|
filter(lambda elem: elem != similar_id, similar_ids)
|
||||||
|
)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
dest_resource = None
|
||||||
|
merge_ids = []
|
||||||
|
|
||||||
|
# update or create the destination resource
|
||||||
|
if dest_resource:
|
||||||
|
is_new_resource = False
|
||||||
|
resource.simhash = create_simhash(index, dest_resource.id_, simhash)
|
||||||
|
await dest_resource.update_from_resource(resource)
|
||||||
|
resource = dest_resource
|
||||||
|
else:
|
||||||
|
is_new_resource = True
|
||||||
|
resource.simhash = simhash_to_bigint(simhash)
|
||||||
|
await resource.save(conn)
|
||||||
|
create_simhash(index, resource.id_, simhash)
|
||||||
|
|
||||||
|
# add resource to search index
|
||||||
|
if resource.content_type in ('html', 'plain'):
|
||||||
|
await index_resource(
|
||||||
|
app.search_engine,
|
||||||
|
tf,
|
||||||
|
site_path,
|
||||||
|
resource,
|
||||||
|
site.base_url,
|
||||||
|
url,
|
||||||
|
)
|
||||||
|
|
||||||
|
# merge resources: merge_ids -> resource
|
||||||
|
for merge_id in merge_ids:
|
||||||
|
# replace links to the merge resource with links to the dest resource
|
||||||
|
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
|
||||||
|
await conn.execute(sql, resource.id_ or None, merge_id)
|
||||||
|
# remove orphaned merge resource
|
||||||
|
sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
|
||||||
|
found = await conn.fetchval(sql, merge_id)
|
||||||
|
if found:
|
||||||
|
await delete_resource(
|
||||||
|
app.search_engine,
|
||||||
|
found[1],
|
||||||
|
merge_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
return True, is_new_resource
|
||||||
|
|
||||||
|
|
||||||
|
async def _save_shortlink(
|
||||||
|
conn, site, url, resource, shortlink_url, last_visit
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Save a shortlink.
|
||||||
|
"""
|
||||||
|
shortlink_durl = await Durl(shortlink_url, base=site.base_url)
|
||||||
|
if shortlink_durl and shortlink_url != url:
|
||||||
|
sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2"
|
||||||
|
sl_path = shortlink_durl.pwa()
|
||||||
|
row = await conn.fetchrow(sql, site.id_, sl_path)
|
||||||
|
shortlink = await SitePath().load_from_row(row)
|
||||||
|
if not shortlink:
|
||||||
|
shortlink = SitePath(
|
||||||
|
site_id=site.id_,
|
||||||
|
path=sl_path,
|
||||||
|
last_visit=last_visit,
|
||||||
|
ok_count=1,
|
||||||
|
canonical=False,
|
||||||
|
resource_id=resource.id_,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
shortlink.last_visit = last_visit
|
||||||
|
shortlink.ok_count += 1
|
||||||
|
shortlink.canonical = False
|
||||||
|
shortlink.resource_id = resource.id_
|
||||||
|
await shortlink.save(conn)
|
355
src/atextcrawler/resource/page.py
Normal file
|
@ -0,0 +1,355 @@
"""
Parse HTML pages.
"""

import logging
from copy import deepcopy
from typing import Optional, Union

from bs4 import BeautifulSoup
from tidylib import tidy_document

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import (
    annotate,
    annotations_remove_section,
    clean_annotations,
    get_tag_counts,
    headline_probability,
)
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl, assort_links
from ..utils.html import (
    clean_body,
    clean_page,
    extract_title,
    get_html_lang,
    get_html_redirect,
)
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.section import iter_sections
from ..utils.tag import keep_tags

logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
logger_links = logging.getLogger(__name__ + '.debug.links')
logger_stats = logging.getLogger(__name__ + '.debug.stats')
logger_sections = logging.getLogger(__name__ + '.debug.sections')


async def parse_html(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
    """
    Extract relevant data from a response returning a TextResource instance.

    The given URL must be the full URL (incl. scheme and netloc) of the page.
    """
    html = resp['content']

    # follow link to canonical URL
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])

    # follow html redirect, if present
    if redir_url := get_html_redirect(html):
        if redir_url not in resp['redirects']:
            return ResourceRedirect(resp['redirects'] + [redir_url])
        else:
            msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}'
            return ResourceError(msg)

    # require html tag
    if not html[:14].lower().startswith('<!doctype html'):
        if '<html' not in html:
            return None

    # real URL after redirection
    url = resp['redirects'][-1]
    durl = await Durl(url)
    if not durl:
        return None

    # page title
    title = extract_title(html)

    # tidy html
    try:
        html, _ = tidy_document(
            html.encode('utf-8'),
            options={
                'logical-emphasis': 1,
                'merge-divs': 1,
                'merge-spans': 1,
                'hide-comments': 1,
                'output-bom': 0,
                'show-errors': 0,
            },
        )
        html = html.decode('utf-8')
    except:
        msg = f'Cannot tidy html from {url}'
        return ResourceError(msg)

    # drop irrelevant tags, including their contents
    soup = clean_page(html)

    # extract shortlink (from http headers or html head)
    shortlink = header_links.get('shortlink')
    if not shortlink and soup.head:
        for link in soup.head.find_all('link'):
            if 'shortlink' in link.get('rel', ''):
                if link.get('href'):
                    shortlink = link.get('href')
                    break

    # language, plaintext, annotations, last change
    lang = get_html_lang(html)
    html = clean_body(str(soup.body))
    head = soup.head
    text, annotations = annotate(html)
    if lng := extract_content_language(text):
        lang = lng
    last_change = extract_latest_date(html, lang=lang)

    # assort internal and external links
    base_url = None
    if head and head.base:
        base_url = head.base.get('href')
    if not base_url and site:
        base_url = site.base_url
    cleaned_links, links_int, links_ext = await assort_links(
        annotations['links'], durl, text, base_url
    )
    annotations['links'] = cleaned_links
    if logger_links.isEnabledFor(logging.DEBUG):
        logger_links.debug('==== internal links')
        for durl_, txt in links_int.items():
            logger_links.debug(f'{durl_.url()} {txt}')
        logger_links.debug('==== external links')
        for durl_, txt in links_ext.items():
            logger_links.debug(f'{durl_.url()} {txt}')

    # keywords from category links
    category_links = set()
    for href, (i, f, rel) in annotations['links'].items():
        if rel and ('category' in rel or 'tag' in rel):
            category_links.add(text[i:f])
    keywords = sorted(category_links)

    # filter out irrelevant sections
    filtered_text, filtered_ann = filter_sections(
        text, annotations, site.boilerplate_texts if site else None
    )

    # debug statistics
    if logger_stats.isEnabledFor(logging.DEBUG):
        sb = annotations['semantic_breaks']
        fsb = filtered_ann['semantic_breaks']
        logger_stats.debug(
            f'Page statistics:'
            f' html_len={len(html)} text_len={len(filtered_text)}'
            f' ratio={len(filtered_text) / len(html):.2f};'
            f' sections={len(sb)} filtered_sections={len(fsb)}'
            f' ratio={len(fsb) / len(sb):.2f} url={durl.url()}'
        )

    return TextResource(
        content_type='html',
        last_change=last_change,
        text_len=len(text),
        lang=lang,
        title=title,
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            'canonical': True if canonical else None,
            'head': head,
        },
        search_fields={
            'title': title,
            'pub_date': last_change,
            'keywords': keywords,
            'text': filtered_text,
            'annotations': filtered_ann,
            'head': str(head),
        },
    )


def filter_sections(text, annotations, boilerplate_texts):
    """
    Filter out irrelevant sections using scores and factoring in neighbors.
    """
    tags = annotations['tags']
    sb = annotations['semantic_breaks']
    section_ids = annotations['section_ids']

    # for i1,f1 in sorted(tags.keys()):
    #     print('  ', i1,f1,tags[(i1,f1)], text[i1:f1])
    # for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
    #     print('-' * lvl, i,f,','.join(tags[(i+1, f)]), sb[i], txt)
    # print('_' * 50)
    # from pprint import pprint
    # pprint(sb)
    # pprint(tags)
    # pprint(section_ids)

    # calculate keep scores for sections
    # negative scores mean: drop; positive scores mean keep;
    # scores between -2 and 2 are undecided
    sections_keep = {}
    headline_probs = {}
    for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
        if prob := headline_probability(txt, tags[(i, f)], lvl):
            headline_probs[(i, f)] = prob
        w = 0
        n_chars = f - i - 1
        # string length
        w = (n_chars - 80) / 80  # initial weight
        # punctuation
        w += 0.4 * text.count('.') + 0.1 * text.count(',')
        # p tag
        if 'p' in tags[(i + 1, f)]:  # prefer keeping paragraphs
            w += 0.7
        # links
        n_links, link_density, avg_text_len = get_tag_counts(
            ('a',), i, f, tags, text
        )
        if link_density > 0.5:
            w = -n_links
        elif link_density > 0.3 and avg_text_len < 60:
            w = -3
        else:
            n_li, li_density, li_len = get_tag_counts(
                ('li',), i, f, tags, text
            )
            if link_density > 0.2 and li_density > 0.8 and li_len < 50:
                w = -3
        if 52 <= lvl < 60:
            w = max(w, 1.0)
        if 'sidebar' in ' '.join(section_ids.get(i, [])):
            w = -3
        if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt):
            w = -3
        # special chars
        if txt.startswith('←') or txt.endswith('→'):  # wordpress navigation
            w = -3
        # remove boilerplate texts
        if boilerplate_texts and txt in boilerplate_texts:
            w = -10
        sections_keep[(i, f)] = w, lvl

    # amend keep scores: look at preceding / subsequent sections with
    # equal level and transfer their keep scores to the current section
    n = len(sections_keep)
    sections = list(sorted(sections_keep.keys()))
    # inspect subsequent sections:
    for rev_ind, s_range in enumerate(reversed(sections)):
        ind = n - 1 - rev_ind
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            w_sum = 0
            n_peers = 0
            for i in range(ind + 1, min(n, ind + 15)):
                w_, lvl_ = sections_keep[sections[i]]
                if lvl_ != lvl:
                    break
                n_peers += 1
                w_sum += w_
            if n_peers >= 3:
                sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
    # inspect preceding sections:
    for ind, s_range in enumerate(sections):
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            w_sum = 0
            n_peers = 0
            for i in range(ind - 1, max(0, ind - 15), -1):
                w_, lvl_ = sections_keep[sections[i]]
                if lvl_ != lvl:
                    break
                n_peers += 1
                w_sum += w_
            if n_peers >= 3:
                sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl

    # amend keep scores: look at sections that could be headlines
    # for subsequent kept sections and increase their score;
    # also allow for up to 2 sections inbetween (which will also
    # have their score increased)
    for rev_ind, s_range in enumerate(reversed(sections)):
        ind = n - 1 - rev_ind
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            if headline_probs.get(s_range, 0) > 0.49:
                # look at subsequent sections with higher level
                child_weights = []
                for i in range(ind + 1, n):
                    w_, lvl_ = sections_keep[sections[i]]
                    if lvl_ <= lvl or w_ < -2:
                        break
                    child_weights.append(w_)
                if nc := len(child_weights):
                    child_avg = sum(child_weights) / nc
                    if w + 1.2 * child_avg > 2:
                        sections_keep[s_range] = w + 1.2 * child_avg, lvl
                        if nc > 1:
                            if (w1 := child_weights[0]) <= 2:
                                sections_keep[sections[ind + 1]] = (
                                    w1 + 1.5 * child_avg,
                                    lvl,
                                )
                        if nc > 2:
                            if (w2 := child_weights[1]) <= 2:
                                sections_keep[sections[ind + 2]] = (
                                    w2 + 2 * child_avg,
                                    lvl,
                                )

    # clean annotations
    clean_annotations(annotations)

    # debug sections
    if logger_sections.isEnabledFor(logging.DEBUG):
        logger_sections.debug('============= Weighted sections =============')
        for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
            w, lvl = sections_keep[(i, f)]
            indent = ('+' if w > 2 else '-') * lvl
            ts = ','.join(tags[(i + 1, f)])
            logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}')

    # narrow down annotations and text to keep_sections
    # drop undecided sections
    filtered_text = text
    filtered_ann = deepcopy(annotations)
    for i, f in sorted(sections_keep.keys(), reverse=True):
        w, lvl = sections_keep[(i, f)]
        if w <= 2.0:
            filtered_ann = annotations_remove_section(filtered_ann, i, f)
            filtered_text = filtered_text[:i] + filtered_text[f:]
    clean_annotations(filtered_ann)

    # debug filtered sections
    if logger_sections.isEnabledFor(logging.DEBUG):
        logger_sections.debug('')
        logger_sections.debug('============= Filtered sections =============')
        fsb = filtered_ann['semantic_breaks']
        ftags = filtered_ann['tags']
        for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100):
            indent = ' ' * lvl
            ts = ','.join(ftags.get((i + 1, f), []))
            logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}')

    return filtered_text, filtered_ann
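The weighting in filter_sections above starts from a length-based score and pushes link-dominated sections far below the keep threshold of +2. A simplified, hypothetical re-statement of only the initial weighting (it ignores tags, levels and neighbor smoothing, and counts punctuation on the section text for simplicity):

def initial_weight(txt: str, link_density: float, n_links: int) -> float:
    # sections longer than ~80 characters score positive
    w = (len(txt) - 80) / 80
    w += 0.4 * txt.count('.') + 0.1 * txt.count(',')
    if link_density > 0.5:
        # link lists are dropped aggressively
        w = -n_links
    return w

# A ~200-char paragraph with two sentences clears the +2 keep threshold,
# while a short navigation row of 5 links ends up at -5 and is dropped.
print(initial_weight('x' * 198 + '. .', 0.1, 0))      # ~2.3
print(initial_weight('Home | About | Blog', 0.9, 5))  # -5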
148
src/atextcrawler/resource/plaintext.py
Normal file
@@ -0,0 +1,148 @@
"""
Parse plaintext pages.
"""

import logging
import re
from typing import Any, Optional, Union

import pypandoc

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import annotate
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.muse import parse_muse

logger = logging.getLogger(__name__)


MAX_LINK_TEXT_LENGTH = 100
"""
Maximum length of a link's text to be kept.

Cf. table site_link, column link_text.
"""


re_url = re.compile(
    r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
    r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)


re_nl = re.compile(r'\r\n')


re_ws = re.compile(r'\s*\n\s*\n\s*')


re_nn = re.compile(r'\n\n')


async def parse_plaintext(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[ResourceRedirect, TextResource]]:
    """
    Extract relevant data from a response returning a TextResource instance.

    The given URL must be the full URL (incl. scheme and netloc) of the page.
    """
    text = resp['content']

    # HTTP headers, canonical URL, shortlink
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])
    shortlink = header_links.get('shortlink')

    if not text:
        return None

    text = re_nl.sub('\n', text)
    text = re_ws.sub('\n\n', text)

    # meta info
    meta: dict[str, Any] = {}
    muse = None
    if durl.path.endswith('.muse'):
        muse = parse_muse(text)
        if muse:
            meta, text = muse
    # title
    if not meta.get('title'):
        meta['title'] = text[:200].splitlines()[0]
    # content language
    if not meta.get('lang'):
        meta['lang'] = extract_content_language(text)
    # publication date
    if not meta.get('pub_date'):
        meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang'))

    # links
    links_int: dict[Durl, tuple[list[str], str]] = {}
    links_ext: dict[Durl, tuple[list[str], str]] = {}
    for url in re_url.findall(text):
        link_durl = await Durl(url[0])
        if link_durl:
            if link_durl.site() == durl.site():
                links_int[link_durl] = [], link_durl.url()
            else:
                links_ext[link_durl] = [], link_durl.url()

    if muse:
        html = pypandoc.convert_text(text, 'html5', format='muse').strip()
        text, annotations = annotate(html)
    else:
        text, annotations = annotate_text(text)

    return TextResource(
        content_type=resp['parser'],
        last_change=meta.get('pub_date'),
        text_len=len(text),
        lang=meta.get('lang'),
        title=meta.get('title'),
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            'canonical': None,
        },
        search_fields={
            'title': meta.get('title'),
            'authors': meta.get('authors'),
            'pub_date': meta.get('pub_date'),
            'keywords': meta.get('keywords'),
            'summary': meta.get('summary'),
            'text': text,
            'annotations': annotations,
        },
    )


def annotate_text(text):
    """
    Return annotations as :func:`utils.annotation.annotate` does.

    Here we only have information on semantic breaks
    (in plaintext they are where empty lines are).
    """
    semantic_breaks = {}
    for match in re_nn.finditer(text):
        semantic_breaks[match.span()[0]] = ''
    annotations = {
        'tags': {},
        'semantic_breaks': semantic_breaks,
        'section_ids': {},
        'links': {},
    }
    return text, annotations
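annotate_text above only records where paragraph gaps sit: each key in semantic_breaks is the offset of the first newline of a blank-line gap. A minimal, self-contained illustration with a made-up three-paragraph text:

import re

re_nn = re.compile(r'\n\n')

text = 'First paragraph.\n\nSecond one.\n\nThird.'
breaks = {m.span()[0]: '' for m in re_nn.finditer(text)}
print(breaks)   # {16: '', 29: ''}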
149
src/atextcrawler/resource/sitemap.py
Normal file
@@ -0,0 +1,149 @@
"""
Sitemap and SitemapIndex and related operations.
"""

import logging
from datetime import datetime
from typing import Optional

import pytz

from ..models import Sitemap, SitemapIndex, TextResource

logger = logging.getLogger(__name__)


async def get_sitemap_urls(
    fetcher,
    base_url: Optional[str],
    sitemaps=None,
) -> list[dict]:
    """
    Try to find sitemaps and fetch and return their URL content.

    Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.
    """
    if sitemaps:
        # test example: https://www.berlin.de/
        check_all = True
    elif base_url:
        sitemaps = [
            base_url.rstrip('/') + '/sitemap.xml',
            base_url.rstrip('/') + '/wp-sitemap.xml',
            base_url.rstrip('/') + '/sitemap_index.xml',
            base_url.rstrip('/') + '/sitemap.xml.gz',
            base_url.rstrip('/') + '/sitemap_index.xml.gz',
            base_url.rstrip('/') + '/sitemap.txt',
            base_url.rstrip('/') + '/sitemap/',
            base_url.rstrip('/') + '/sitemap1.xml',
            base_url.rstrip('/') + '/sitemap-index.xml',
            base_url.rstrip('/') + '/sitemapindex.xml',
            base_url.rstrip('/') + '/sitemap/index.xml',
        ]
        check_all = False
    else:
        return []
    urls = []
    for sitemap in sitemaps:
        resource = await fetcher.fetch(sitemap)
        found = True
        if isinstance(resource, SitemapIndex):
            for sitemap_ in resource.sitemaps:
                sitemaps.append(sitemap_['loc'])
        elif isinstance(resource, Sitemap):
            urls += resource.urls
        elif isinstance(resource, TextResource) and resource.content_type in (
            'html',
            'plain',
        ):
            urls += [
                {'loc': durl.url()}
                for durl in resource.init_fields['links_int']
            ]
        else:
            found = False
        if found and not check_all:
            break
    return urls


def parse_sitemapindex(sitemapindex):
    """
    Parse a sitemap index returning a `SitemapIndex` with found sitemaps.
    """
    sitemaps = []
    for tag in sitemapindex.find_all('sitemap'):
        if loc := tag.find('loc'):
            if loc.string:
                sitemap = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        t = datetime.fromisoformat(lastmod.string.strip())
                        sitemap['lastmod'] = t
                    except:
                        pass
                sitemaps.append(sitemap)
    return SitemapIndex(sitemaps=sitemaps)


def parse_sitemap(urlset) -> Sitemap:
    """
    Parse a sitemap urlset, returning a `Sitemap` with its URLs.

    Each URL is a dict with these keys+values:

    * loc: the full URL of a mapped resource
    * lastmod: optional datetime of its last modification
    * changefreq: optional info on the change frequency to be expected
    * priority: optional info on its priority relative to other resources

    Cf. https://www.sitemaps.org/protocol.html
    """
    urls = []
    for tag in urlset.find_all('url'):
        if loc := tag.find('loc'):
            if loc.string:
                url = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        t = lastmod.string.strip().rstrip('Z')
                        url['lastmod'] = (
                            datetime.fromisoformat(t)
                            .astimezone(pytz.utc)
                            .replace(tzinfo=None)
                        )
                    except:
                        pass
                if changefreq := tag.find('changefreq'):
                    url['changefreq'] = changefreq.string.strip()
                if priority := tag.find('priority'):
                    url['priority'] = priority.string.strip()
                urls.append(url)
    return Sitemap(urls=urls)


def extract_sitemap_paths(
    base_url: Optional[str],
    urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
    """
    Extract essential information from sitemap URLs.

    Return a list of relative paths of the site's resources
    (in a form to be easily fed into `add_site_paths`) and
    the datetime of the latest change.

    Relative paths are computed using base_url.
    """
    paths = []
    latest = None
    for url in urls:
        loc = url['loc']
        lastmod = url.get('lastmod')
        if loc.startswith(base_url or ''):
            path = loc.removeprefix(base_url or '').lstrip('/')
            path = path.split('#', 1)[0]
            paths.append((path, True))
            if lastmod:
                latest = max(lastmod, latest or lastmod)
    return paths, latest
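parse_sitemap above expects an already parsed XML tree from which loc/lastmod/changefreq/priority are picked per url tag. A minimal usage sketch, assuming this module's parse_sitemap and extract_sitemap_paths are importable and using a made-up urlset:

from bs4 import BeautifulSoup

xml = '''
<urlset>
  <url>
    <loc>https://example.org/post/1</loc>
    <lastmod>2021-11-01T12:00:00+00:00</lastmod>
    <changefreq>weekly</changefreq>
  </url>
</urlset>
'''
urlset = BeautifulSoup(xml, 'html.parser')
sitemap = parse_sitemap(urlset)
print(sitemap.urls[0]['loc'])   # https://example.org/post/1

# relative paths for add_site_paths, plus the latest modification time
paths, latest = extract_sitemap_paths('https://example.org/', sitemap.urls)
print(paths)    # [('post/1', True)]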
6
src/atextcrawler/search/__init__.py
Normal file
@@ -0,0 +1,6 @@
from .engine import (
    delete_resource,
    index_resource,
    shutdown_engine,
    startup_engine,
)
270
src/atextcrawler/search/engine.py
Normal file
@@ -0,0 +1,270 @@
"""
Search engine, for now elasticsearch.

We have one index per supported language and a default one.
"""

import logging
import warnings
from difflib import SequenceMatcher
from typing import Union

from elasticsearch import AsyncElasticsearch
from elasticsearch.exceptions import NotFoundError

from ..utils.annotation import pack_annotations
from ..utils.section import concat_section_texts

logger = logging.getLogger(__name__)


warnings.filterwarnings(
    'ignore',
    'The client is unable to verify that the'
    ' server is Elasticsearch due security privileges on the server side',
)


MIN_INDEXING_TIMEOUT_SECONDS = 5


language_analyzers = {
    'en': 'english',
    'de': 'german',
    #'fr': 'french',
    #'el': 'greek',
    #'es': 'spanish',
    'default': 'standard',
}


properties = {
    'resource_id': {'type': 'long'},
    'site_id': {'type': 'long'},
    'url': {'type': 'text'},
    'base_url': {'type': 'text'},
    'pub_date': {'type': 'date', 'format': 'yyyy-MM-dd||yyyy-MM||yyyy'},
    'lang': {'type': 'keyword'},
    'title': {'type': 'text'},
    'authors': {'type': 'text'},
    'summary': {'type': 'text'},
    'keywords': {'type': 'text'},
    'collections': {'type': 'keyword'},
    'time_horizon': {'type': 'keyword'},
    'orig_source': {'type': 'text'},
    'topics': {'type': 'text'},
    'annotations': {'type': 'text', 'index': False},
    'sections': {
        'type': 'nested',
        'properties': {
            'start_ids': {'type': 'integer'},
            'end_ids': {'type': 'integer'},
            'text': {'type': 'text', 'index_options': 'offsets'},
            'embedding': {'type': 'dense_vector', 'dims': 512},
        },
    },
}


async def startup_engine(config):
    """
    Open the search engine for access.
    """
    engine = AsyncElasticsearch(
        host=config['elasticsearch']['host'],
        api_key=(
            config['elasticsearch']['id'],
            config['elasticsearch']['api_key'],
        ),
        use_ssl=False,
        timeout=20,
    )
    engine.index_base_name = config['elasticsearch']['index_base_name']
    await create_indices(engine)
    await open_indices(engine)
    return engine


async def create_indices(engine):
    """
    Create indices for all configured languages.
    """
    for lang, analyzer in language_analyzers.items():
        index_name = engine.index_base_name + '_text_' + lang
        if not await engine.indices.exists(index=index_name):
            await engine.indices.create(index=index_name)
            await engine.indices.close(index=index_name)
            await engine.indices.put_settings(
                index=index_name,
                body={
                    'analysis': {'analyzer': {'default': {'type': analyzer}}},
                    'refresh_interval': '60s',
                },
            )
            await engine.indices.put_mapping(
                index=index_name,
                body={'properties': properties},
            )


async def open_indices(engine):
    """
    Open indices for all configured languages.
    """
    for lang in language_analyzers.keys():
        index_name = engine.index_base_name + '_text_' + lang
        await engine.indices.open(index=index_name)


async def shutdown_engine(engine):
    """
    Close the connection to the search engine.
    """
    # await close_indices(engine)
    await engine.close()


async def close_indices(engine):
    """
    Close indices. UNUSED.
    """
    for lang in language_analyzers.keys():
        index_name = engine.index_base_name + '_text_' + lang
        await engine.indices.close(index=index_name)


async def index_resource(
    engine,
    tf,
    site_path,
    resource,
    base_url,
    url,
):
    """
    Index a resource.
    """
    lang = resource.lang
    index_lang = lang if lang in language_analyzers.keys() else 'default'
    index_name = engine.index_base_name + '_text_' + index_lang
    pub_date = resource.search_fields.get('pub_date')
    if pub_date:
        pub_date = str(pub_date.date())
    text = resource.search_fields.get('text')
    annotations = resource.search_fields.get('annotations')
    semantic_breaks = annotations['semantic_breaks']
    sections = []
    for section_ids, txt in concat_section_texts(text, semantic_breaks):
        embedding = await tf.embed(txt)
        sections.append(
            {
                'start_ids': section_ids[0],
                'end_ids': section_ids[-1],
                'text': txt,
                'embedding': embedding,
            }
        )
    doc = {
        'resource_id': resource.id_,
        'site_id': site_path.site_id,
        'url': url,
        'base_url': base_url,
        'pub_date': pub_date,
        'lang': resource.lang,
        'title': resource.search_fields.get('title'),
        'authors': resource.search_fields.get('authors'),
        'summary': resource.search_fields.get('summary'),
        'keywords': resource.search_fields.get('keywords'),
        'collections': resource.search_fields.get('collections'),
        'time_horizon': resource.search_fields.get('time_horizon'),
        'orig_source': resource.search_fields.get('orig_source'),
        'topics': resource.search_fields.get('topics'),
        'annotations': pack_annotations(annotations),
        'sections': sections,
    }
    timeout_seconds = max(MIN_INDEXING_TIMEOUT_SECONDS, int(len(text) / 1000))
    await engine.index(
        id=resource.id_,
        index=index_name,
        body=doc,
        timeout=f'{timeout_seconds}s',
    )


async def delete_resource(engine, lang, resource_id):
    """
    Delete a resource.
    """
    index_name = engine.index_base_name + '_text_' + (lang or 'default')
    try:
        await engine.delete(index_name, resource_id)
    except NotFoundError:
        msg = f'Cannot delete resource from index, not found: {resource_id}'
        logger.warning(msg)


async def find_duplicate(engine, site_id, resource) -> Union[bool, None, int]:
    """
    UNUSED.

    Try to find a duplicate resource with matching site.

    If the search backend query fails, return False.
    If no matching resource was found, return None.
    If a matching resource was found, return its id.
    """
    # get sample texts
    text = resource.search_fields['text']
    if not text or len(text) < 100:
        return None
    # annotations = resource.search_fields['annotations']
    # semantic_breaks = annotations['semantic_breaks']
    # texts = []
    # for _, txt in concat_section_texts(text, semantic_breaks):
    #     texts.append(txt)
    # texts = extract_samples(texts)

    # # search for sample texts
    # text_count = len(texts)
    # should_min = max(1, int(0.6 * text_count))
    # should = []
    # for text in texts:
    #     should.append({'match': {'sections.text': text}})
    query = {
        'bool': {
            'must': {
                'nested': {
                    'path': 'sections',
                    'query': {'match': {'sections.text': text}},
                },
            },
            'filter': {
                'term': {
                    'site_id': site_id,
                },
            },
        }
    }
    fields = [
        'url',
        'sections.text',
        'site_id',
    ]
    response = await engine.search(
        index=engine.index_base_name + '_text_*',
        body={
            'query': query,
            'fields': fields,
            'from': 0,
            'size': 3,
            '_source': False,
        },
    )
    if response['timed_out']:
        return False
    for hit in response.get('hits', {}).get('hits'):
        txt = ' '.join(hit['fields']['sections.text'])
        similarity = SequenceMatcher(None, text, txt).ratio()
        if similarity > 0.99:
            return hit['_id']
    return None
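Judging from startup_engine above, the expected configuration carries an elasticsearch block with a host, api-key credentials and an index base name. A sketch of such a config dict; the values are placeholders, only the keys are taken from the code:

config = {
    'elasticsearch': {
        'host': 'localhost:9200',        # placeholder
        'id': 'my-api-key-id',           # placeholder
        'api_key': 'my-api-key-secret',  # placeholder
        'index_base_name': 'atext',      # placeholder
    },
}

# engine = await startup_engine(config)   # creates/opens per-language indices
# ...
# await shutdown_engine(engine)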
9
src/atextcrawler/site/__init__.py
Normal file
@@ -0,0 +1,9 @@
"""
Websites.
"""

from .feeds import fetch_feeds
from .operations import checkin_site, checkout_site, process_site, update_site
from .queue import process_site_queue
from .robots import RobotsInfo
from .seed import load_seeds
68
src/atextcrawler/site/__main__.py
Normal file
@@ -0,0 +1,68 @@
"""
Tool for analyzing a website.

Fetch the startpage and output information to console.
Do not change any persistent data.
"""

import asyncio
import logging
import sys

import aiohttp

from ..models import TextResource
from ..resource import ResourceFetcher, extract_sitemap_paths, get_sitemap_urls
from ..site.robots import RobotsInfo
from ..utils.durl import Durl
from .parse import parse_startpage

logger = logging.getLogger()
logger.setLevel(logging.WARNING)
logger.addHandler(logging.StreamHandler())


async def run():
    """
    Fetch the startpage of a website and show information about it.

    The URL must be given as commandline argument.
    """
    base_url = sys.argv[1]
    async with aiohttp.ClientSession() as session:
        if not (base_durl := await Durl(base_url)):
            return
        fetcher = ResourceFetcher(session)
        resource = await fetcher.fetch(base_url)
        logger.warning(repr(resource))
        if (
            isinstance(resource, TextResource)
            and resource.content_type == 'html'
        ):
            site = await parse_startpage(resource)
            # site.crawl_enabled = await site_filter(site)
            logger.warning(repr(site))
            logger.warning('')
            for durl, text in site.links_ext.items():
                logger.warning(f' {durl} {text}')
                logger.warning(f'{durl.url()} -------- {text}')
            logger.warning('')
            logger.warning(f'Redirects: {resource.init_fields["redirects"]}')
            logger.warning('')
            robots = await RobotsInfo(base_url)
            urls = await get_sitemap_urls(
                fetcher, base_url, sitemaps=robots.site_maps
            )
            paths, latest = extract_sitemap_paths(base_url, urls)
            for path in paths:
                logger.warning(path)
            logger.warning(f'Feeds: {site.feeds}')
            logger.warning(latest)
            # sample_links = extract_samples(resource.init_fields['links_int'])
            # logger.warning(f'************* {sample_links}')
        else:
            logger.warning('(No text resource or error.)')


if __name__ == '__main__':
    asyncio.run(run())
100
src/atextcrawler/site/feeds.py
Normal file
@@ -0,0 +1,100 @@
"""
High-level feed-related stuff.

See resource.feed for low-level stuff not primarily related to sites.
"""

from datetime import datetime
from typing import Optional

from ..models import Feed
from ..resource import store_feed_entries, update_feed


async def store_new_feeds(conn, site_id, feeds: dict):
    """
    Store new feeds in table site_feed.
    """
    sql = "SELECT array_agg(url) FROM site_feed WHERE site_id=$1"
    known_feeds = (await conn.fetchval(sql, site_id)) or []
    for feed_url in feeds.keys():
        if feed_url not in known_feeds:
            feed = Feed(
                site_id=site_id,
                url=feed_url,
            )
            await feed.save(conn)


async def get_feeds(conn, site_id) -> list[Feed]:
    """
    Return stored feeds for the given site.
    """
    sql = "SELECT * FROM site_feed WHERE site_id=$1"
    rows = (await conn.fetch(sql, site_id)) or []
    return [(await Feed().load_from_row(row)) for row in rows]


async def fetch_feeds(fetcher, conn, site) -> Optional[datetime]:
    """
    Fetch feeds, add new resources and return the latest content update time.
    """
    feeds = await get_feeds(conn, site.id_)
    latest = None
    for feed in feeds:
        feed_content = await update_feed(fetcher, feed, conn)
        if feed_content:
            await store_feed_entries(conn, site, feed_content)
        if feed.t_content:
            latest = max(latest or feed.t_content, feed.t_content)
    return latest


if __name__ == '__main__':
    # only use this on a dev instance!
    import asyncio
    import logging
    import sys

    import aiohttp

    from ..config import Config
    from ..db import PGPool
    from ..resource.fetch import ResourceFetcher
    from .operations import process_site, update_site

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    config = Config().get()
    url = sys.argv[1]

    async def run():
        """
        Fetch and display a site.
        """
        app = None  # TODO
        async with PGPool(config['postgresql']) as pool:
            async with pool.acquire() as conn:
                async with aiohttp.ClientSession() as session:
                    fetcher = ResourceFetcher(session)
                    site, _ = await update_site(app, fetcher, conn, url)
                    logger.warning(site)
                    await process_site(fetcher, conn, site)
                    latest = await fetch_feeds(fetcher, conn, site)
                    logger.warning(f'latest: {latest}')

                    # feed = Feed(url=url)
                    # feed_content = await update_feed(fetcher, feed, conn)
                    # if isinstance(feed_content, ResourceError):
                    #     print(feed_content)
                    # else:
                    #     print(feed)
                    #     pprint(feed_content[0])
                    # print('---- 2nd try ----')
                    # feed_content = await update_feed(fetcher, feed, conn)
                    # if isinstance(feed_content, ResourceError):
                    #     print(feed_content)
                    # else:
                    #     print(feed)
                    #     pprint(feed_content[0])

    asyncio.run(run())
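The `latest` folding in fetch_feeds above keeps the newest t_content seen so far while tolerating an initial None. A tiny worked example with hypothetical timestamps:

from datetime import datetime

t_contents = [datetime(2021, 11, 1), None, datetime(2021, 12, 24)]

latest = None
for t_content in t_contents:
    if t_content:   # mirrors: if feed.t_content:
        latest = max(latest or t_content, t_content)

print(latest)   # 2021-12-24 00:00:00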
267
src/atextcrawler/site/operations.py
Normal file
@@ -0,0 +1,267 @@
"""
Operations on sites.
"""

import logging
from datetime import datetime, timedelta
from typing import Optional

from asyncpg import Connection

from ..models import Crawl, Site, TextResource
from ..resource import (
    add_site_paths,
    extract_sitemap_paths,
    get_sitemap_urls,
    store_boilerplate_texts,
)
from ..utils.durl import Durl
from ..utils.similarity import get_simhash_index
from .feeds import fetch_feeds, store_new_feeds
from .parse import parse_startpage
from .robots import RobotsInfo

logger = logging.getLogger(__name__)


async def checkout_site(
    app, conn: Connection
) -> tuple[Optional[int], bool, bool]:
    """
    Get the id of a site to be crawled and mark it with crawl_active=true.

    Also return whether the site shall be fully crawled; if not, this
    means that just the resources from the feeds shall be crawled.

    Also return whether more sites might be available.
    """
    async with conn.transaction():
        sql = (
            "SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
            " FROM site WHERE crawl_enabled AND crawl_active = false"
            " AND (next_full_crawl < now() at time zone 'UTC'"
            " OR next_feed_crawl < now() at time zone 'UTC')"
            " LIMIT 1 FOR UPDATE SKIP LOCKED"
        )
        row = await conn.fetchrow(sql)
        if row:
            site_id = row['id']
            is_full = row['is_full']
            sql = "UPDATE site SET crawl_active = true WHERE id=$1"
            await conn.execute(sql, site_id)
            site = await Site().load(conn, site_id)
            if site:
                site.base_durl = await Durl(site.base_url)
                if site.base_durl:
                    site.simhash_index = await get_simhash_index(conn, site_id)
                    return site, is_full, True
                else:
                    # site not available; schedule next crawl
                    int_full = app.config['crawl']['full_crawl_interval']
                    int_feed = app.config['crawl']['feed_crawl_interval']
                    now = datetime.utcnow()
                    t_full = now + timedelta(seconds=int_full)
                    t_feed = now + timedelta(seconds=int_full + int_feed)
                    sql = (
                        "UPDATE site SET crawl_active=false,"
                        " next_full_crawl=$1, next_feed_crawl=$2"
                        " WHERE id=$3"
                    )
                    await conn.execute(sql, t_full, t_feed, site_id)
                    return None, False, True
            return None, False, True
        return None, False, False


async def update_site(
    app, fetcher, conn: Connection, base_url, site: Site = None
) -> tuple[Optional[Site], bool]:
    """
    Try to fetch base_url and return a site and whether a new one was created.

    This function is run for all sites (including blacklisted and irrelevant
    ones). It determines whether the site shall be crawled.

    If an error occurs, return (None, False), and if a site was given,
    also set it to crawl_enabled=False and remove crawling schedules.

    If base_url could be fetched, update the site, possibly creating
    a new one.

    If the site has crawl_enabled, and no full crawl is scheduled,
    schedule one (by updating column `next_full_crawl`).
    """
    # fetch startpage
    logger.info(f'Updating site={site}, base_url={base_url}')
    resource = await fetcher.fetch(base_url, site=site)
    if (
        not isinstance(resource, TextResource)
        or resource.content_type != 'html'
    ):
        if site:
            site.meta_info['error'] = 'Invalid start page'
            site.crawl_enabled = False
            site.next_full_crawl = None
            site.next_feed_crawl = None
            await site.save(conn)
        logger.info(f'Failed startpage {base_url}: {resource}')
        return None, False

    # parse startpage (extract site information) and save the site
    site = await parse_startpage(resource, app=app, site=site)
    site_id, created = await site.save(conn)
    if created:
        logger.debug(f'Created {site}')

    # add black-/white-listing info
    is_allowed = await is_site_allowed(conn, site.id_, base_url)
    if is_allowed is not None and is_allowed != site.crawl_enabled:
        site.crawl_enabled = is_allowed
        await site.save(conn)

    # schedule full crawl, if none is scheduled and the site shall be crawled
    if site.crawl_enabled:
        sql = (
            "UPDATE site"
            " SET next_full_crawl=now() at time zone 'UTC'"
            " WHERE id=$1 AND next_full_crawl IS null"
        )
        await conn.execute(sql, site_id)

    return site, created


async def is_site_allowed(
    conn: Connection,
    site_id: Optional[int],
    base_url: str,
) -> Optional[bool]:
    """
    Return True if the site is whitelisted, False if blacklisted, else None.

    Also add missing site_ids to the annotations.
    """
    sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
    anns = await conn.fetch(sql, site_id, base_url)
    for ann in anns:
        if ann['ann_type'] == 'blacklist':
            return False
        if ann['ann_type'] == 'whitelist':
            return True
    # add missing site_ids
    if site_id and any([ann['site_id'] is None for ann in anns]):
        sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
        await conn.execute(sql, site_id, base_url)
    return None


async def process_site(fetcher, conn: Connection, site: Site):
    """
    Process a site: fetch and store more information.

    Store external and internal links; find boilerplate texts;
    fetch sitemaps; fetch feeds; update date of last publication.
    """
    if not site.id_:  # only to satisfy typing
        return
    if site.links_ext:
        await _store_cross_site_links(conn, site.id_, site.links_ext)
    if site.links_int:
        paths = []
        for durl, (rel, _) in site.links_int.items():
            canon = (rel and rel.lower() == 'canonical') or None
            paths.append((durl.pwa(), canon))
        await add_site_paths(conn, site.id_, paths)

    await store_boilerplate_texts(fetcher, conn, site)

    # get sitemaps and add their resources
    robots = await RobotsInfo(site.base_url)  # type: ignore
    urls = await get_sitemap_urls(
        fetcher, site.base_url, sitemaps=robots.site_maps
    )
    paths_, latest = extract_sitemap_paths(site.base_url, urls)
    await add_site_paths(conn, site.id_, paths_)

    # store feeds and their resources
    await store_new_feeds(conn, site.id_, site.feeds)
    latest_ = await fetch_feeds(fetcher, conn, site)
    if latest_:
        latest = max(latest or latest_, latest_)

    # update last_pub
    if latest:
        site.last_pub = latest
        await site.save(conn)


async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
    """
    Unlock the site and schedule next crawl.

    *crawl* is the crawl that has just finished (regularly or stopped).

    If the crawl was stopped (t_end is None), just unlock the site.

    Otherwise schedule a crawl of the same type. After a full crawl
    also a feed crawl is scheduled, if there was none scheduled.
    """
    if crawl.t_end is None:
        sql = "UPDATE site SET crawl_active=false WHERE id=$1"
        await conn.execute(sql, site.id_)
    elif crawl.is_full:
        full_interval = app.config['crawl']['full_crawl_interval']
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_full_crawl=$1,"
            " next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
        )
        await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
    else:
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_feed_crawl=$1"
            " WHERE id=$2"
        )
        await conn.execute(sql, next_feed_crawl, site.id_)


async def _store_cross_site_links(
    conn: Connection,
    site_id: int,
    links: dict[Durl, tuple[list[str], str]],
) -> None:
    """
    Put outgoing links into site_link/site_queue for existing/unknown sites.

    Separate outgoing links from *site_id* into two classes:
    (a) existing sites (rows in table site) and (b) unknown links.
    Add links from class (a) to table site_link.
    Add links from class (b) to table site_queue.
    """
    # add outgoing cross-site links for existing sites to table site_link
    urls = [url.site() for url in links.keys()]
    values = []
    sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
    if rows := await conn.fetch(sql, urls):
        for row in rows:
            if (durl := await Durl(row['url'])) in links.keys():
                _, link_text = links.pop(durl)
                if site_id != row['id']:
                    values.append((site_id, row['id'], link_text))
        sql = (
            "INSERT INTO site_link (src, dst, link_text)"
            " VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
        )
        await conn.executemany(sql, values)

    # add outgoing cross-site links for unknown sites to table site_queue
    sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
    values = [
        (site_id, durl.site()[:200], link_text[:100])
        for durl, (_, link_text) in links.items()
    ]
    await conn.executemany(sql, values)
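A sketch of the rescheduling arithmetic in checkin_site above: after a finished full crawl, the next full and feed crawls are both offset from the crawl's begin time. The interval values below are invented; the real ones come from config['crawl'].

from datetime import datetime, timedelta

full_interval = 10 * 24 * 3600   # hypothetical: 10 days
feed_interval = 24 * 3600        # hypothetical: 1 day

t_begin = datetime(2021, 12, 1, 6, 0, 0)
next_full_crawl = t_begin + timedelta(seconds=full_interval)
next_feed_crawl = t_begin + timedelta(seconds=feed_interval)

print(next_full_crawl)   # 2021-12-11 06:00:00
print(next_feed_crawl)   # 2021-12-02 06:00:00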
255
src/atextcrawler/site/parse.py
Normal file
@@ -0,0 +1,255 @@
"""
Parsing of a site's startpage.
"""

import re
from datetime import datetime
from typing import Any, Optional

from ..models import Site, TextResource
from ..resource import feed_types
from ..utils.durl import Durl, get_ips
from ..utils.html import clean_html
from ..utils.lang import clean_lang
from ..utils.link import (
    extract_domain,
    in_blacklist,
    link_rels,
    meta_names,
    meta_props,
)

re_meta_keyword_sep = re.compile('[,;\r\n]')


def cut_str(s: Optional[str], l: int) -> Optional[str]:
    """
    Cut a string *s* to a maximal length *l* from the left.
    """
    return s[:l] if s else None


async def parse_startpage(
    startpage: TextResource, app=None, site=None
) -> Site:
    """
    Parse a site's startpage and return a Site instance.

    If a site instance is given, update it.
    """
    durl = startpage.init_fields['durl']
    soup = startpage.init_fields['head']
    meta = collect_meta_tags(soup)
    meta_links = await collect_meta_links(soup, durl)
    links_ext = await collect_external_links(startpage, meta_links)
    links_int = startpage.init_fields['links_int']
    langs = extract_languages(startpage, meta, meta_links)
    title, description, keywords = extract_meta_texts(startpage, meta)

    # feeds
    feeds = meta_links['feeds']
    if 'wordpress' in meta.get('generator', '').lower():
        url = durl.site() + 'feed/'
        feeds[url] = 'application/rss+xml'
    # TODO later: maybe also probe other possible feed paths 'rss', 'rss/'

    # network params (canonical_url, base_urls, domains)
    ips = await get_ips(durl.hostname)
    redirects = []
    for redirect in startpage.init_fields['redirects']:
        redir_url = await Durl(redirect)
        if redir_url:
            redirects.append(redir_url.site())
    base_urls = redirects + [durl.url()]
    domains = [extract_domain(durl.hostname)]

    if site:  # update an existing Site
        site.canonical_url = meta_links['canonical_url'] or site.canonical_url
        site.base_urls = base_urls
        site.domains = domains
        site.ips = ips
        site.last_update = datetime.utcnow()
        site.last_pub = startpage.last_change
        site.langs = langs
        site.alt_langs = meta_links['alt_langs']
        site.title = title
        site.description = description
        site.keywords = keywords
        site.linkbacks.update(meta_links['linkbacks'])
        site.meta_info = meta
        site.__post_init__(
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
        )
    else:  # create new Site instance
        site = Site(
            # post_init fields
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
            # dataclass fields
            canonical_url=meta_links['canonical_url'],
            base_urls=base_urls,
            domains=domains,
            ips=ips,
            last_update=datetime.utcnow(),
            last_pub=startpage.last_change,
            langs=list(langs),
            alt_langs=meta_links['alt_langs'],
            title=title,
            description=description,
            keywords=keywords,
            linkbacks=meta_links['linkbacks'],
            meta_info=meta,
        )
    if site.ips is None and site.url:
        site.ips = await get_ips(site.url.hostname)
    if app and site.startpage_text:
        site_filter = app.plugins['filter_site'].site_filter
        site.crawl_enabled = await site_filter(site)
    return site


def collect_meta_tags(soup):
    """
    Collect selected meta tags (meta_names and meta_props) with their values.
    """
    meta = {}
    for tag in soup.find_all('meta'):
        if (name := tag.get('name')) and name in meta_names:
            meta[name] = tag.get('content')
        if (property := tag.get('property')) in meta_props:
            if content := tag.get('content'):
                meta[property] = content
        if tag.get('http-equiv') == 'content-language':  # old html
            if content := tag.get('content'):
                meta['http_equiv_lang'] = content
    return meta


async def collect_meta_links(soup, base_durl) -> dict[str, Any]:
    """
    Collect link tags with site scope (feeds, linkbacks, canonical, ...).
    """
    linkbacks = {}
    feeds = {}
    alt_langs = {}
    canonical_url = None
    for tag in soup.find_all('link'):
        if not (rels := set(tag.get('rel', []))) or not rels & link_rels:
            continue
        if not (url := tag.get('href')):
            continue
        if not (link_durl := await Durl(url, base=base_durl)):
            continue
        if in_blacklist(link_durl.hostname):
            continue
        link_url = link_durl.url()
        link_type = tag.get('type')
        if link_type in feed_types:
            feeds[link_url] = link_type
        elif 'canonical' in rels:
            canonical_url = link_url
        elif 'alternate' in rels and (hreflang := tag.get('hreflang')):
            if lang := clean_lang(hreflang):
                alt_langs[lang] = link_durl.url()
        elif 'webmention' in rels:
            linkbacks[link_url] = 'webmention'
        elif 'pingback' in rels:
            linkbacks[link_url] = 'pingback'
    if canonical_url:
        if canonical_durl := await Durl(canonical_url):
            canonical_url = canonical_durl.site()
        else:
            canonical_url = None
    return {
        'feeds': feeds,
        'linkbacks': linkbacks,
        'alt_langs': alt_langs,
        'canonical_url': canonical_url,
    }


async def collect_external_links(startpage, meta_links) -> dict[str, str]:
    """
    Return external links (mapping from URL to link text) from startpage.

    Also add links to alternate language variants of the site.
    """
    external_links = startpage.init_fields['links_ext'].copy()
    netloc = startpage.init_fields['durl'].netloc
    for lang, lang_url in meta_links['alt_langs'].items():
        if netloc not in lang_url:
            durl = await Durl(lang_url)
            if durl:
                external_links[durl] = f'Alternate language: {lang}'
    return external_links


def extract_meta_texts(page, meta) -> tuple[str, Optional[str], list[str]]:
    """
    Extract and return title, description, keywords from a page and meta tags.
    """
    title = meta.get('og:site_name')
    if not title:
        title = page.search_fields['title'] or ''
        if meta_title := meta.pop('title', None):
            if meta_title.lower() not in title.lower():
                title += ('; ' if title else '') + meta_title
    title = cut_str(clean_html(title), 200)
    description = cut_str(clean_html(meta.pop('description', None)), 2000)
    if meta_keywords := meta.pop('keywords', None):
        kws = re_meta_keyword_sep.split(meta_keywords)
        keywords = [kw.strip()[:50] for kw in kws if kw.strip()]
        if len(keywords) < 2:
            keywords = [
                kw.strip()[:50]
                for kw in meta_keywords.split(' ')
                if kw.strip()
            ]
    else:
        keywords = []
    return title, description, keywords


def extract_languages(page, meta, meta_links) -> set[str]:
    """
    Extract languages from a page's html tag, meta tags and HTTP headers.

    Also add the language detected in the text content of the page.

    Return a set of ISO 639-1 language codes.

    See also https://www.w3.org/International/questions/qa-http-and-lang and
    https://www.w3.org/International/questions/qa-html-language-declarations
    """
    languages = set()
    if lang := clean_lang(page.lang):
        languages.add(lang)
    if lang := clean_lang(meta.get('http_equiv_lang')):
        languages.add(lang)
|
||||||
|
if lang := clean_lang(meta.get('dc.language')):
|
||||||
|
languages.add(lang)
|
||||||
|
if lang := clean_lang(meta.get('og:locale')):
|
||||||
|
languages.add(lang)
|
||||||
|
for lang, lang_url in meta_links['alt_langs'].items():
|
||||||
|
if page.init_fields['durl'].netloc in lang_url:
|
||||||
|
if lng := clean_lang(lang):
|
||||||
|
languages.add(lng)
|
||||||
|
lngs = (
|
||||||
|
page.init_fields['headers']
|
||||||
|
.get('Content-Language', '')
|
||||||
|
.lower()
|
||||||
|
.replace(' ', '')
|
||||||
|
.split(',')
|
||||||
|
)
|
||||||
|
for lng in lngs:
|
||||||
|
if lang := clean_lang(lng):
|
||||||
|
languages.add(lang)
|
||||||
|
languages.add(page.lang)
|
||||||
|
return languages
|
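A minimal usage sketch of the helpers above (not part of the committed file; it assumes a coroutine context, a fetched startpage `html` string and this module's own imports such as Durl):

from bs4 import BeautifulSoup

async def inspect_startpage(html: str):
    # hypothetical helper, for illustration only
    soup = BeautifulSoup(html, 'html.parser')
    meta = collect_meta_tags(soup)                 # selected meta names/properties
    base_durl = await Durl('https://example.com/')
    meta_links = await collect_meta_links(soup, base_durl)
    return meta.get('description'), meta_links['feeds'], meta_links['canonical_url']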
127
src/atextcrawler/site/queue.py
Normal file
@ -0,0 +1,127 @@
"""
Queue of sites.

When processing a resource, its external links are put into database table
`site_queue`.
The items in `site_queue` are processed in :func:`process_site_queue`.
This is done baseURL by baseURL (see :func:`iter_site_queue`).
While doing this, cross-site links are put into table `site_link`.
"""

import logging
from typing import AsyncIterator, Optional

import aiohttp
from asyncpg import Connection

from ..resource import ResourceFetcher
from .operations import update_site

logger = logging.getLogger(__name__)


async def process_site_queue(app, pool):
    """
    Loop over queued sites creating new sites and adding cross-site links.
    """
    site_delay = app.config['crawl']['site_delay']
    resource_delay = app.config['crawl']['resource_delay']
    async with pool.acquire() as conn:
        async with aiohttp.ClientSession() as session:
            fetcher = ResourceFetcher(session)
            while app.running:
                async for base_url, links_from in iter_site_queue(app, conn):
                    # get or create site
                    msg = f'Site queue: updating {base_url}'
                    logger.debug(msg)
                    site, created = await update_site(
                        app, fetcher, conn, base_url
                    )
                    if site:
                        await store_incoming_site_site_links(
                            conn, site.id_, links_from
                        )
                    # delete handled queue items
                    sql = "DELETE FROM site_queue WHERE url=$1"
                    await conn.execute(sql, base_url)
                    await app.sleep(resource_delay)
                logger.debug(
                    f'Queued sites exhausted, sleeping'
                    f' for {site_delay} seconds'
                )
                await app.sleep(site_delay)


async def iter_site_queue(
    app, conn: Connection
) -> AsyncIterator[tuple[str, dict[int, str]]]:
    """
    Yield URLs with aggregated link information from site_queue.

    Yield a URL and a dict mapping ids of linking sites to link texts.
    """
    site_revisit_interval = app.config['crawl']['site_revisit_interval']
    while app.running:
        sql = (
            "SELECT url, array_agg(src) srcs,"
            " array_agg(link_text) link_texts"
            " FROM site_queue GROUP BY url LIMIT 1"
        )
        row = await conn.fetchrow(sql)
        if row:
            base_url = row['url']
            links_from = {}
            srcs = row['srcs']
            link_texts = row['link_texts']
            for i in range(len(srcs)):
                if src := srcs[i]:
                    links_from[src] = link_texts[i]
            if site_id := await site_recently_updated(
                conn, base_url, site_revisit_interval
            ):
                # just store incoming links and remove the site from the queue
                await store_incoming_site_site_links(conn, site_id, links_from)
                sql = "DELETE FROM site_queue WHERE url=$1"
                await conn.execute(sql, base_url)
            else:
                yield base_url, links_from
        else:
            break


async def site_recently_updated(
    conn: Connection,
    base_url: str,
    site_revisit_interval: float,
) -> Optional[int]:
    """
    Return the id of the site with given base_url if it was updated recently.
    """
    sql = (
        f"SELECT id FROM site WHERE $1=any(base_urls)"
        f" AND last_update + interval '{site_revisit_interval} seconds'"
        f" > now() at time zone 'utc' LIMIT 1"
    )
    site_id = await conn.fetchval(sql, base_url)
    return site_id


async def store_incoming_site_site_links(
    conn: Connection, site_id: int, links_from: dict
):
    """
    Store incoming site-site links (irrespective of crawl_enabled).

    *site_id* is the id of the site to which the links in *links_from* point.
    """
    sql = (
        "INSERT INTO site_link"
        " (src, dst, link_text) VALUES ($1, $2, $3)"
        " ON CONFLICT (src, dst) DO NOTHING"
    )
    values = [
        (from_id, site_id, link_text)
        for from_id, link_text in links_from.items()
        if from_id != site_id
    ]
    await conn.executemany(sql, values)
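How this loop is wired up is not shown in this file; a hedged sketch, assuming `app` and `pool` come from the application setup:

import asyncio

async def start_queue_processing(app, pool):
    # hypothetical wrapper; process_site_queue runs until app.running is False
    task = asyncio.create_task(process_site_queue(app, pool))
    await task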
98
src/atextcrawler/site/robots.py
Normal file
@ -0,0 +1,98 @@
"""
Fetch and evaluate a website's robots.txt.
"""

import logging
from typing import Optional, Union
from urllib.robotparser import RobotFileParser

import aiohttp

logger = logging.getLogger(__name__)


class RobotsInfo(RobotFileParser):
    """
    Obtain information from a site's robots.txt.

    After instantiation, the instance must be awaited to fetch and
    parse the robots.txt.
    """

    def __init__(
        self,
        site_url: str,
        user_agent: str = '*',
        session: aiohttp.ClientSession = None,
    ):
        super().__init__()
        self.__user_agent = user_agent
        self.__site_url = site_url.rstrip('/')
        self.__robots_url = self.__site_url + '/robots.txt'
        self.__timeout = aiohttp.ClientTimeout(sock_connect=2, sock_read=3)
        self.__session = session

    def __await__(self):
        return self.__ainit__().__await__()

    async def __ainit__(self):
        if self.__session:
            content = await self.__get_robots_txt(self.__session)
        else:
            async with aiohttp.ClientSession() as session:
                content = await self.__get_robots_txt(session)
        self.parse(content.splitlines())
        self.__delay = self.crawl_delay(self.__user_agent)
        request_rate = self.request_rate(self.__user_agent)
        if request_rate:
            self.__delay = request_rate.seconds / request_rate.requests
        self.__site_maps = super().site_maps() or []
        return self

    async def __get_robots_txt(self, session: aiohttp.ClientSession) -> str:
        """
        Fetch and return the robots.txt over http.
        """
        try:
            async with session.get(
                self.__robots_url, timeout=self.__timeout
            ) as resp:
                if resp.status == 200:
                    try:
                        content = await resp.text()
                    except:
                        body = await resp.read()
                        content = body.decode(
                            resp.charset or 'utf-8', errors='ignore'
                        )
                else:
                    content = ''
        except aiohttp.ClientError:
            content = ''
        return content

    @property
    def user_agent(self) -> str:
        """
        The user agent being used.
        """
        return self.__user_agent

    @property
    def delay(self) -> Optional[Union[int, float]]:
        """
        The delay to be used between requests.
        """
        return self.__delay

    @property
    def site_maps(self) -> list[str]:
        """
        The list of sitemaps of the site.
        """
        return self.__site_maps

    def can_fetch_url(self, url: str) -> bool:
        """
        Return whether fetching of the given *url* is allowed.
        """
        return super().can_fetch(self.__user_agent, url)
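Since RobotsInfo is awaited rather than started explicitly, a typical use looks like this (sketch with an example URL):

async def fetch_allowed(url: str) -> bool:
    robots = await RobotsInfo('https://example.com')
    # robots.delay and robots.site_maps are available after the await
    return robots.can_fetch_url(url)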
72
src/atextcrawler/site/seed.py
Normal file
@ -0,0 +1,72 @@
"""
Seeding of new installations with URLs from blacklists and whitelists.
"""

from pathlib import Path

import asyncpg

from ..utils.durl import Durl


async def load_seeds(config: dict, pool: asyncpg.Pool) -> None:
    """
    Add seed file contents (site blacklist and whitelist).

    If there are sites already, do nothing.
    """
    async with pool.acquire() as conn:
        site_count = await conn.fetchval("SELECT count(*) FROM site")
        if site_count:
            return

        # add blacklist entries
        values = []
        blacklist = _load_list(config['config_dir'], 'black')
        for base_url in blacklist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                values.append((url, {'source': 'seed file'}))
        sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'blacklist', $2)"
        )
        await conn.executemany(sql, values)

        # add whitelist entries
        values1 = []
        values2 = []
        whitelist = _load_list(config['config_dir'], 'white')
        for base_url in whitelist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                if url not in blacklist:
                    values1.append((url, {'source': 'seed file'}))
                    values2.append((url,))
        sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'whitelist', $2)"
        )
        await conn.executemany(sql, values1)
        sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)"
        await conn.executemany(sql, values2)


def _load_list(config_dir, black_white):
    """
    Load the seed black or white list.
    """
    path = Path(config_dir) / 'initial_data' / f'seed_urls.list'
    with open(path, 'r') as list_file:
        urls = []
        for line in list_file.read().strip().splitlines():
            line_ = line.strip()
            if line_.startswith('#'):
                continue
            if black_white == 'black' and line_.startswith('-'):
                urls.append(line_[1:].strip())
            if black_white == 'white' and line_.startswith('+'):
                urls.append(line_[1:].strip())
        return urls
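From _load_list the expected layout of config_dir/initial_data/seed_urls.list can be inferred: lines starting with '#' are comments, '-' marks a blacklist entry, '+' a whitelist entry. An illustrative example (the URLs are placeholders):

# seed URLs
+https://example.org/
+https://blog.example.net/
-https://spam.example.com/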
69
src/atextcrawler/tensorflow.py
Normal file
@ -0,0 +1,69 @@
"""
Query the tensorflow_model_server's REST API.
"""

import logging
from typing import Optional, Union

import aiohttp

logger = logging.getLogger(__name__)


class TensorFlow:
    """
    Fetch an embedding vector from the tensorflow model server.
    """

    def __init__(
        self,
        app,
        session: aiohttp.ClientSession,
        timeout_sock_connect: Union[int, float] = 0.5,
        timeout_sock_read: Union[int, float] = 10,
    ):
        self.config = app.config['tensorflow']
        self.session = session
        self.timeout = aiohttp.ClientTimeout(
            sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
        )

    async def embed(
        self, text: Union[str, list[str]]
    ) -> Optional[Union[list[float], list[list[float]]]]:
        """
        Query the tensorflow_model_server's REST API for a prediction.

        Take a string or a list of strings and return an embedding vector
        or a list of embedding vectors.

        If the request fails or times out, return None.
        """
        text_ = text if isinstance(text, list) else [text]
        data = {'signature_name': 'serving_default', 'instances': text_}
        try:
            async with self.session.post(
                self.config['model_server_endpoint'],
                json=data,
                timeout=self.timeout,
            ) as resp:
                try:
                    res = await resp.json()
                    if isinstance(text, list):
                        return res.get('predictions')
                    else:
                        return res.get('predictions')[0]
                except:
                    msg = 'Got invalid response from tensorflow'
                    logger.error(msg)
                    return None
        except Exception as err:
            msg = 'Could not get embedding from tensorflow for '
            if isinstance(text, str):
                msg += f'string of length {len(text)}'
            else:
                msg += 'list of strings with lengths '
                msg += ','.join([str(len(s)) for s in text])
            msg += f', reason: {err}'
            logger.error(msg)
            return None
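A usage sketch, assuming an `app` whose config provides tensorflow.model_server_endpoint (e.g. a locally running tensorflow_model_server):

import aiohttp

async def embed_demo(app):
    async with aiohttp.ClientSession() as session:
        tf = TensorFlow(app, session)
        single = await tf.embed('Hello world')                  # one vector, or None
        batch = await tf.embed(['first text', 'second text'])   # list of vectors, or None
        return single, batch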
0
src/atextcrawler/utils/__init__.py
Normal file
481
src/atextcrawler/utils/annotation.py
Normal file
@ -0,0 +1,481 @@
|
||||||
|
"""
|
||||||
|
Convert html to plain text with annotations over character ranges.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
from .json import json_dumps, json_loads
|
||||||
|
from .link import nofollow_link_rels
|
||||||
|
from .tag import keep_tags, self_closing_tags
|
||||||
|
|
||||||
|
MAX_HREF_LENGTH = 200
|
||||||
|
"""
|
||||||
|
Maximum length of an href. Other links are discarded.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
text_blacklist = [
|
||||||
|
'previous',
|
||||||
|
'next',
|
||||||
|
'back', # common pagination navigation
|
||||||
|
'↩︎', # amusewiki footnote separator (after conversion from muse to html)
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
Texts to ignore.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class AnnotatingParser(HTMLParser):
|
||||||
|
"""
|
||||||
|
Parse tagged text resulting in pure text and annotations.
|
||||||
|
|
||||||
|
The text is available in self.text and the annotations
|
||||||
|
in self.annotations, which is a dict with these keys:
|
||||||
|
|
||||||
|
* tags: contains a mapping of offset ranges (i, f) to
|
||||||
|
the tags opening at i and closing at f
|
||||||
|
* semantic_breaks: a mapping of offset positions where
|
||||||
|
a new section begins to the nesting level of that
|
||||||
|
sections; a section is wherever an (opening or closing)
|
||||||
|
separating tag is placed in the raw html; for the
|
||||||
|
separating flag of tags see tag.py
|
||||||
|
* links: a mapping of hrefs to link texts obtained from
|
||||||
|
anchor (a) tags; we skip hyperref with nofollow rels
|
||||||
|
* section_ids: map an offset position to the first
|
||||||
|
id attribute (of any tag) at the beginning of a
|
||||||
|
semantic section; this can later be used in a URL
|
||||||
|
fragment for linking directly into this section
|
||||||
|
|
||||||
|
Internally, we put opening tags on self.stack and pop them
|
||||||
|
when the first matching closing tag is encountered. We assume
|
||||||
|
balanced tags (tidy html).
|
||||||
|
|
||||||
|
NB: all tags with semantic breaks have sep=True, i.e.,
|
||||||
|
they will have spaces around them so that the semantic breaks
|
||||||
|
always sit on a space; the semantic break position p is the end
|
||||||
|
of the last section and the next sections begins at p + 1.
|
||||||
|
|
||||||
|
The text always begins with a ' ' (added if not in the original),
|
||||||
|
which is assigned a semantic break with default level 80
|
||||||
|
(if there is no semantic break tag at the beginning).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.text = ' ' # concatenated text data (without tags)
|
||||||
|
self.pos = 1 # equal to len(self.text)
|
||||||
|
self.stack = []
|
||||||
|
self.tags = defaultdict(dict)
|
||||||
|
self.semantic_breaks = {0: 80}
|
||||||
|
self.tag_id = None
|
||||||
|
self.section_ids = defaultdict(list)
|
||||||
|
self.links = {}
|
||||||
|
self.add_space = False
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
"""
|
||||||
|
Finish by collecting results in dict `self.annotations`.
|
||||||
|
"""
|
||||||
|
super().close()
|
||||||
|
self.annotations = {}
|
||||||
|
self.annotations['links'] = self.links
|
||||||
|
self.annotations['semantic_breaks'] = {
|
||||||
|
pos: lvl for pos, lvl in sorted(self.semantic_breaks.items())
|
||||||
|
}
|
||||||
|
self.annotations['tags'] = self.tags
|
||||||
|
self.annotations['section_ids'] = self.section_ids
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
"""
|
||||||
|
Called for each opening tag.
|
||||||
|
"""
|
||||||
|
sep, lvl, sem = keep_tags[tag]
|
||||||
|
attrs = dict(attrs)
|
||||||
|
if sep:
|
||||||
|
self.add_space = True
|
||||||
|
if tag == 'section' and 'endnotes' in attrs.get('role', ''):
|
||||||
|
lvl = 25
|
||||||
|
# ARIA roles
|
||||||
|
if role := attrs.get('role'):
|
||||||
|
if role == 'article':
|
||||||
|
lvl = 15
|
||||||
|
elif role == 'heading':
|
||||||
|
if aria_level := attrs.get('aria-level'):
|
||||||
|
if aria_level in (1, 2, 3, 4, 5, 6):
|
||||||
|
sep, lvl, sem = keep_tags[f'h{aria_level}']
|
||||||
|
elif role == 'region':
|
||||||
|
lvl = 24
|
||||||
|
i = self.pos
|
||||||
|
if tag in self_closing_tags:
|
||||||
|
# self-closing tags will not be added to the result tags,
|
||||||
|
# they only appear in semantic_breaks
|
||||||
|
# the two self-closing tags br and hr both have lvl and sep
|
||||||
|
if i == 1: # replace the default semantic break at pos 0
|
||||||
|
i = 0
|
||||||
|
self.add_semantic_break(i, lvl)
|
||||||
|
i += 1
|
||||||
|
if tag_id := attrs.get('id'):
|
||||||
|
self.tag_id = i, tag_id
|
||||||
|
self.add_tag_id(i) # br or hr may have an id, too
|
||||||
|
self.add_space = True
|
||||||
|
else:
|
||||||
|
self.stack.append((i, tag, sep, lvl, sem, attrs))
|
||||||
|
# forget outdated tag id at new semantic break
|
||||||
|
if lvl:
|
||||||
|
self.forget_tag_id()
|
||||||
|
# memorize tag id
|
||||||
|
if not self.tag_id and (tag_id := attrs.get('id')):
|
||||||
|
self.tag_id = self.pos, tag_id
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
"""
|
||||||
|
Called for each closing tag.
|
||||||
|
"""
|
||||||
|
if not self.stack or (self.stack and self.stack[-1][1] != tag):
|
||||||
|
return # nothing to do for an already closed self-closing tag
|
||||||
|
i, tag_, sep, lvl, sem, attrs = self.stack.pop()
|
||||||
|
f = self.pos
|
||||||
|
# omit tag without content
|
||||||
|
if i == f:
|
||||||
|
return
|
||||||
|
# for a closing div tag revise lvl to minimum level of contained
|
||||||
|
# semantic breaks (if any)
|
||||||
|
if tag == 'div':
|
||||||
|
min_lvl = 101
|
||||||
|
for pos_, lvl_ in reversed(self.semantic_breaks.items()):
|
||||||
|
if pos_ <= i:
|
||||||
|
break
|
||||||
|
min_lvl = min(min_lvl, lvl_)
|
||||||
|
if min_lvl < 101:
|
||||||
|
lvl = min_lvl
|
||||||
|
# add semantic break and an optional section_id
|
||||||
|
if lvl:
|
||||||
|
if i == 1: # replace the default semantic break at pos 0
|
||||||
|
i = 0
|
||||||
|
if tag in ('ul', 'ol', 'li'):
|
||||||
|
seen_tags = [x[1] for x in self.stack]
|
||||||
|
if 'p' not in seen_tags:
|
||||||
|
lvl = 52 + seen_tags.count('tag')
|
||||||
|
if tag == 'li':
|
||||||
|
lvl += 1
|
||||||
|
self.add_semantic_break(i, lvl)
|
||||||
|
self.add_tag_id(i)
|
||||||
|
# do not include surrounding spaces in tag span
|
||||||
|
if self.text[i] == ' ':
|
||||||
|
i += 1
|
||||||
|
# add tag
|
||||||
|
self.tags[(i, f)][tag] = sem
|
||||||
|
# add space (when handling next data)
|
||||||
|
if sep:
|
||||||
|
self.add_space = True
|
||||||
|
# collect links
|
||||||
|
if tag == 'a':
|
||||||
|
self.extract_link(i, attrs)
|
||||||
|
|
||||||
|
def handle_data(self, text):
|
||||||
|
"""
|
||||||
|
Called for each non-tag content between tags.
|
||||||
|
"""
|
||||||
|
# handle empty or blacklisted text
|
||||||
|
if text == '':
|
||||||
|
return
|
||||||
|
if text == ' ':
|
||||||
|
self.add_space = True
|
||||||
|
return
|
||||||
|
if text.strip().lower() in text_blacklist:
|
||||||
|
if ' ' in text:
|
||||||
|
self.add_space = True
|
||||||
|
return
|
||||||
|
# add a space (at self.pos) if the text begins with one
|
||||||
|
# or if we shall add one
|
||||||
|
startswith_space = text.startswith(' ')
|
||||||
|
text = text.lstrip()
|
||||||
|
if startswith_space or self.add_space:
|
||||||
|
if self.text[-1] != ' ':
|
||||||
|
self.text += ' '
|
||||||
|
self.pos += 1
|
||||||
|
self.add_space = False
|
||||||
|
# strip a space at the end of text and handle it in end tag
|
||||||
|
if text.endswith(' '):
|
||||||
|
text = text[:-1]
|
||||||
|
self.add_space = True
|
||||||
|
# add text to self.text
|
||||||
|
self.text += text
|
||||||
|
self.pos += len(text)
|
||||||
|
|
||||||
|
def add_semantic_break(self, pos, lvl):
|
||||||
|
"""
|
||||||
|
Add a semantic break of level *lvl* at position *pos*.
|
||||||
|
"""
|
||||||
|
if pos in self.semantic_breaks:
|
||||||
|
self.semantic_breaks[pos] = min(self.semantic_breaks[pos], lvl)
|
||||||
|
else:
|
||||||
|
self.semantic_breaks[pos] = lvl
|
||||||
|
|
||||||
|
def forget_tag_id(self):
|
||||||
|
"""
|
||||||
|
Reset a tag id if it is too far behind in the text stream.
|
||||||
|
"""
|
||||||
|
if self.tag_id:
|
||||||
|
pos_, tag_id = self.tag_id
|
||||||
|
if pos_ + 200 < self.pos:
|
||||||
|
self.tag_id = None
|
||||||
|
|
||||||
|
def add_tag_id(self, pos):
|
||||||
|
"""
|
||||||
|
Add and clear an id if the just closing section has none yet.
|
||||||
|
|
||||||
|
*pos* is the start position of the current section, and the
|
||||||
|
position where the id will be added.
|
||||||
|
|
||||||
|
Add an id only if we are not too far in the section's text already.
|
||||||
|
"""
|
||||||
|
if self.tag_id:
|
||||||
|
pos_, tag_id = self.tag_id
|
||||||
|
if pos_ < pos + 100 and pos not in self.section_ids:
|
||||||
|
self.section_ids[pos].append(tag_id.lower())
|
||||||
|
self.tag_id = None
|
||||||
|
|
||||||
|
def extract_link(self, i, attrs):
|
||||||
|
"""
|
||||||
|
Add a link covering character range (i, self.pos).
|
||||||
|
|
||||||
|
From html *attrs* extract href and rel.
|
||||||
|
"""
|
||||||
|
if (href := attrs.get('href')) and not attrs.get('rel') == 'nofollow':
|
||||||
|
if href.startswith('#'):
|
||||||
|
return
|
||||||
|
if len(href) > MAX_HREF_LENGTH:
|
||||||
|
return
|
||||||
|
attrs.get('title', '')
|
||||||
|
if rel := attrs.get('rel'):
|
||||||
|
if set(rel) & nofollow_link_rels:
|
||||||
|
return
|
||||||
|
self.links[href] = i, self.pos, rel
|
||||||
|
|
||||||
|
|
||||||
|
def annotate(html):
|
||||||
|
"""
|
||||||
|
Split html text into plain text with annotations (from AnnotatingParser).
|
||||||
|
"""
|
||||||
|
parser = AnnotatingParser()
|
||||||
|
parser.reset()
|
||||||
|
parser.feed(html)
|
||||||
|
parser.close()
|
||||||
|
return parser.text, parser.annotations
|
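A small round-trip sketch for annotate(); the input is illustrative and assumes the listed tags are among keep_tags in tag.py, and the exact break levels depend on that module:

html = '<h1 id="intro">Title</h1><p>Some <em>text</em> with a <a href="https://example.com/">link</a>.</p>'
text, annotations = annotate(html)
# text is the plain text (starting with a space); annotations has the keys
# 'tags', 'semantic_breaks', 'section_ids' and 'links' described above, e.g.
# annotations['links'] maps 'https://example.com/' to (start, end, rel).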
||||||
|
|
||||||
|
|
||||||
|
re_footnote = re.compile(r'^\s*\[\d+\]\s+')
|
||||||
|
|
||||||
|
|
||||||
|
def headline_probability(text, tags, lvl) -> float:
|
||||||
|
"""
|
||||||
|
Estimate the probability that the text with tags is a headline.
|
||||||
|
|
||||||
|
The context is not considered: The question is not whether the
|
||||||
|
text is a headline for the following text.
|
||||||
|
"""
|
||||||
|
text = text.strip()
|
||||||
|
res = 0.0
|
||||||
|
if not text:
|
||||||
|
return res
|
||||||
|
if lvl < 60:
|
||||||
|
return 1.0
|
||||||
|
# if 'h1' in tags or 'h2' in tags or 'h3' in tags or\
|
||||||
|
# 'h4' in tags or 'h5' in tags or 'h6' in tags or 'center' in tags:
|
||||||
|
# return 1.0
|
||||||
|
if len(text) < 80:
|
||||||
|
res = 0.7
|
||||||
|
else:
|
||||||
|
res = 0.7 - 0.7 * (len(text) - 80) / 200
|
||||||
|
if 'p' in tags:
|
||||||
|
res -= 0.4
|
||||||
|
if 'em' in tags:
|
||||||
|
res += 0.3
|
||||||
|
if 'a' in tags:
|
||||||
|
res -= 0.1
|
||||||
|
if text[-1] in '.:':
|
||||||
|
res -= 0.3
|
||||||
|
res -= 0.1 * text.count(', ')
|
||||||
|
if re_footnote.match(text):
|
||||||
|
res -= 0.4
|
||||||
|
return max(res, 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def get_tag_counts(tag_names, i, f, tags, text) -> tuple[int, float, float]:
|
||||||
|
"""
|
||||||
|
Return the info on the share of characters covered with one of the *tags*.
|
||||||
|
|
||||||
|
Only consider the characters between i and f of string *text*.
|
||||||
|
|
||||||
|
Return the number of tags that have an overlap in the specified region,
|
||||||
|
the tag density in the region (fraction of covered characters by all),
|
||||||
|
and the average number of covered chars per tag.
|
||||||
|
|
||||||
|
NB: If more than one tag name is given, then the fractional share
|
||||||
|
may exceed 1.
|
||||||
|
"""
|
||||||
|
if i == f:
|
||||||
|
return 0, 0.0, 0.0
|
||||||
|
tag_count = 0
|
||||||
|
covered_chars = 0
|
||||||
|
for (s_i, s_f), anns in tags.items():
|
||||||
|
if overlap := range_overlap(i, f - 1, s_i, s_f - 1):
|
||||||
|
for ann in anns:
|
||||||
|
if ann in tag_names:
|
||||||
|
tag_count += 1
|
||||||
|
covered_chars += overlap[1] - overlap[0]
|
||||||
|
all_chars = f - i
|
||||||
|
tag_density = covered_chars * 1.0 / all_chars
|
||||||
|
avg_text_len = covered_chars * 1.0 / tag_count if tag_count else 0
|
||||||
|
return tag_count, tag_density, avg_text_len
|
||||||
|
|
||||||
|
|
||||||
|
def range_overlap(i1, f1, i2, f2):
|
||||||
|
"""
|
||||||
|
Return the overlap of both ranges (None if there is none).
|
||||||
|
"""
|
||||||
|
return None if f1 <= i2 or f2 <= i1 else (max(i1, i2), min(f1, f2))
|
||||||
|
|
||||||
|
|
||||||
|
def annotations_remove_section(annotations, i, f):
|
||||||
|
"""
|
||||||
|
Remove section (i, f) from annotations and return result.
|
||||||
|
"""
|
||||||
|
new_annotations = {}
|
||||||
|
d = f - i
|
||||||
|
if not d:
|
||||||
|
return annotations
|
||||||
|
|
||||||
|
# relocate tags
|
||||||
|
new_tags = {}
|
||||||
|
for (t_i, t_f), anns in annotations['tags'].items():
|
||||||
|
n_i, n_f = cut_range(i, f, d, t_i, t_f)
|
||||||
|
if n_i is not None:
|
||||||
|
new_tags[(n_i, n_f)] = anns
|
||||||
|
new_annotations['tags'] = new_tags
|
||||||
|
|
||||||
|
# relocate links
|
||||||
|
new_links = {}
|
||||||
|
for href, (l_i, l_f, rel) in annotations['links'].items():
|
||||||
|
n_i, n_f = cut_range(i, f, d, l_i, l_f)
|
||||||
|
if n_i is not None:
|
||||||
|
new_links[href] = n_i, n_f, rel
|
||||||
|
|
||||||
|
# relocate semantic breaks and section_ids
|
||||||
|
semantic_breaks = annotations['semantic_breaks']
|
||||||
|
section_ids = annotations['section_ids']
|
||||||
|
new_semantic_breaks = {}
|
||||||
|
new_section_ids = {}
|
||||||
|
for pos in sorted(semantic_breaks.keys()):
|
||||||
|
level = semantic_breaks[pos]
|
||||||
|
if i <= pos and pos < f:
|
||||||
|
continue # discard
|
||||||
|
elif f <= pos:
|
||||||
|
new_semantic_breaks[pos - d] = level
|
||||||
|
if pos in section_ids:
|
||||||
|
new_section_ids[pos - d] = section_ids[pos]
|
||||||
|
else:
|
||||||
|
new_semantic_breaks[pos] = level
|
||||||
|
if pos in section_ids:
|
||||||
|
new_section_ids[pos] = section_ids[pos]
|
||||||
|
|
||||||
|
# collect and return results
|
||||||
|
new_annotations['semantic_breaks'] = new_semantic_breaks
|
||||||
|
new_annotations['section_ids'] = new_section_ids
|
||||||
|
new_annotations['links'] = new_links
|
||||||
|
return new_annotations
|
||||||
|
|
||||||
|
|
||||||
|
def cut_range(i, f, d, t_i, t_f):
|
||||||
|
"""
|
||||||
|
Return the new coordinates of a text range (t_i,t_f) after cutting (i,f).
|
||||||
|
|
||||||
|
If (t_i,t_f) is fully within (i,f), return None, None.
|
||||||
|
"""
|
||||||
|
if t_f < i:
|
||||||
|
return t_i, t_f
|
||||||
|
elif t_i < i <= t_f <= f:
|
||||||
|
return t_i, i
|
||||||
|
elif t_i < i and f <= t_f:
|
||||||
|
return t_i, t_f - d
|
||||||
|
elif i <= t_i and t_f <= f:
|
||||||
|
return None, None
|
||||||
|
elif i <= t_i <= f < t_f:
|
||||||
|
return i, t_f - d
|
||||||
|
else: # f < t_i
|
||||||
|
return t_i - d, t_f - d
|
||||||
|
|
||||||
|
|
||||||
|
def clean_annotations(annotations: dict) -> None:
|
||||||
|
"""
|
||||||
|
Remove void stuff from annotations.
|
||||||
|
"""
|
||||||
|
cleaned_tags = {}
|
||||||
|
for (i, f), anns in annotations['tags'].items():
|
||||||
|
if f > i and anns:
|
||||||
|
cleaned_tags[(i, f)] = anns
|
||||||
|
annotations['tags'] = cleaned_tags
|
||||||
|
|
||||||
|
|
||||||
|
def pack_annotations(annotations):
|
||||||
|
"""
|
||||||
|
Pack annotations to a special JSON string, reducing their volume a little.
|
||||||
|
"""
|
||||||
|
return json_dumps(
|
||||||
|
{
|
||||||
|
'tags': _pack_tags(annotations['tags']),
|
||||||
|
'semantic_breaks': ','.join(
|
||||||
|
[
|
||||||
|
f'{pos}:{level}'
|
||||||
|
for pos, level in annotations['semantic_breaks'].items()
|
||||||
|
]
|
||||||
|
),
|
||||||
|
'section_ids': annotations['section_ids'],
|
||||||
|
'links': annotations['links'],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _pack_tags(tags: dict) -> str:
|
||||||
|
"""
|
||||||
|
Utility function for packing tag information into a string.
|
||||||
|
"""
|
||||||
|
res = ''
|
||||||
|
for (i, f), anns in tags.items():
|
||||||
|
if anns:
|
||||||
|
anns_ = ','.join([f'{tag}={sem}' for tag, sem in anns.items()])
|
||||||
|
res += f'{i}-{f}:{anns_}\n'
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def unpack_annotations(json_text: str) -> dict:
|
||||||
|
"""
|
||||||
|
Unpack tag information from a string.
|
||||||
|
"""
|
||||||
|
annotations = json_loads(json_text)
|
||||||
|
tags = {}
|
||||||
|
for line in annotations['tags'].split('\n'):
|
||||||
|
if line:
|
||||||
|
range_, anns_ = line.split(':')
|
||||||
|
i, f = range_.split('-')
|
||||||
|
i = int(i)
|
||||||
|
f = int(f)
|
||||||
|
anns = {}
|
||||||
|
if anns_:
|
||||||
|
for ann_ in anns_.split(','):
|
||||||
|
tag_, sem_ = ann_.split('=')
|
||||||
|
anns[tag_] = sem_
|
||||||
|
tags[(i, f)] = anns
|
||||||
|
semantic_breaks = {}
|
||||||
|
for sb_ in annotations['semantic_breaks'].split(','):
|
||||||
|
pos_, lvl_ = sb_.split(':')
|
||||||
|
semantic_breaks[int(pos_)] = int(lvl_)
|
||||||
|
return {
|
||||||
|
'tags': tags,
|
||||||
|
'semantic_breaks': semantic_breaks,
|
||||||
|
'section_ids': annotations['section_ids'],
|
||||||
|
'links': annotations['links'],
|
||||||
|
}
|
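pack_annotations and unpack_annotations are intended as inverses, up to JSON's coercions (section_ids keys come back as strings, link ranges as lists); a sketch:

packed = pack_annotations(annotations)    # compact JSON string for storage
restored = unpack_annotations(packed)
assert restored['semantic_breaks'] == annotations['semantic_breaks']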
90
src/atextcrawler/utils/date_finder.py
Normal file
@ -0,0 +1,90 @@
|
||||||
|
"""
|
||||||
|
Find date expressions in a string.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
p_day = r'(0?[1-9]|[12][0-9]|3[01])'
|
||||||
|
p_month = r'(0?[1-9]|1[0-2])'
|
||||||
|
p_year = r'(20\d\d|19\d\d)'
|
||||||
|
sep = r'\D{1,2}'
|
||||||
|
p_t = r'(\D{0,4}([01][0-9]|2[0-3]):([0-5][0-9]))?'
|
||||||
|
|
||||||
|
|
||||||
|
format_re = {
|
||||||
|
'iso': (
|
||||||
|
re.compile(f'(^|\\D){p_year}{sep}{p_month}{sep}{p_day}(\\D{p_t}|$)'),
|
||||||
|
(1, 2, 3, 6, 7),
|
||||||
|
),
|
||||||
|
'dmy': (
|
||||||
|
re.compile(f'(^|\\D){p_day}{sep}{p_month}{sep}{p_year}(\\D{p_t}|$)'),
|
||||||
|
(3, 2, 1, 6, 7),
|
||||||
|
),
|
||||||
|
'mdy': (
|
||||||
|
re.compile(f'(^|\\D){p_month}{sep}{p_day}{sep}{p_year}(\\D{p_t}|$)'),
|
||||||
|
(3, 1, 2, 6, 7),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
lang_format = {
|
||||||
|
'de': ('iso', 'dmy'),
|
||||||
|
'en': ('iso', 'mdy'),
|
||||||
|
None: ('iso', 'dmy', 'mdy'),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_latest_date(text: str, lang: str = None) -> Optional[datetime]:
|
||||||
|
"""
|
||||||
|
Extract the latest date compatible with the *lang* from *text*.
|
||||||
|
|
||||||
|
Only consider dates in the past.
|
||||||
|
"""
|
||||||
|
dates = extract_dates(text, lang=lang)
|
||||||
|
return max(dates) if dates else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_dates(text: str, lang: str = None) -> list[datetime]:
|
||||||
|
"""
|
||||||
|
Extract dates from a string, optionally limiting formats to a language.
|
||||||
|
"""
|
||||||
|
dates = []
|
||||||
|
fmts = lang_format.get(lang, lang_format[None])
|
||||||
|
for fmt in fmts:
|
||||||
|
re_, slots = format_re[fmt]
|
||||||
|
matches = re_.findall(text)
|
||||||
|
if matches:
|
||||||
|
for match in matches:
|
||||||
|
try:
|
||||||
|
date = datetime(
|
||||||
|
int(match[slots[0]]),
|
||||||
|
int(match[slots[1]]),
|
||||||
|
int(match[slots[2]]),
|
||||||
|
int(match[slots[3]] or 0),
|
||||||
|
int(match[slots[4]] or 0),
|
||||||
|
)
|
||||||
|
if date <= datetime.utcnow():
|
||||||
|
dates.append(date)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return dates
|
||||||
|
|
||||||
|
|
||||||
|
## from htmldate import find_date
|
||||||
|
|
||||||
|
# def extract_last_pub(html):
|
||||||
|
# """
|
||||||
|
# Return an estimate for the time of last content publication from html.
|
||||||
|
# """
|
||||||
|
# # https://stackoverflow.com/questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
|
||||||
|
# lxml_tree = lxml_html.fromstring(bytes(html, encoding='utf8'))
|
||||||
|
# # publication date (from startpage)
|
||||||
|
# try:
|
||||||
|
# date_string = find_date(lxml_tree)
|
||||||
|
# pd = date.fromisoformat(date_string)
|
||||||
|
# last_pub = datetime(pd.year, pd.month, pd.day, 12, 0, 0)
|
||||||
|
# except:
|
||||||
|
# last_pub = None
|
||||||
|
# return last_pub
|
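Usage sketch with illustrative strings:

extract_latest_date('Published 2021-11-29 18:15, updated 30.11.2021', lang='de')
# -> datetime(2021, 11, 30, 0, 0): the latest past date among the 'de' formats (iso, dmy)
extract_dates('Meeting on 03/04/2021', lang='en')
# -> [datetime(2021, 3, 4, 0, 0)]: with 'en' the mdy format applies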
278
src/atextcrawler/utils/durl.py
Normal file
@ -0,0 +1,278 @@
|
||||||
|
"""
|
||||||
|
Hyperlink parsing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from urllib.parse import urlsplit
|
||||||
|
|
||||||
|
import tldextract
|
||||||
|
from async_dns import types
|
||||||
|
from async_dns.resolver import ProxyResolver
|
||||||
|
from async_lru import alru_cache
|
||||||
|
|
||||||
|
from .link import in_blacklist
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
resolver = ProxyResolver(request_timeout=2)
|
||||||
|
|
||||||
|
|
||||||
|
async_dns_logger = logging.getLogger('async_dns')
|
||||||
|
async_dns_logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
|
extract = tldextract.TLDExtract(cache_dir=False)
|
||||||
|
|
||||||
|
|
||||||
|
# tldextract uses filelock; set its loglevel to warning
|
||||||
|
filelock_logger = logging.getLogger('filelock')
|
||||||
|
filelock_logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
|
class Durl:
|
||||||
|
"""
|
||||||
|
Decomposed URL, contains :class:`urllib.parse.SplitResult`.
|
||||||
|
|
||||||
|
When constructing this class, it has to be awaited, e.g.:
|
||||||
|
|
||||||
|
my_durl = await Durl('http://www.example.com/whatever')
|
||||||
|
|
||||||
|
The given URL will be decomposed, validated and normalized.
|
||||||
|
If the URL is invalid, we return None instead of an instance.
|
||||||
|
|
||||||
|
If the given *base* is None, the URL must be absolute and
|
||||||
|
the hostname must be valid (DNS lookup).
|
||||||
|
|
||||||
|
If the given URL is not absolute, an already decomposed (and thus
|
||||||
|
valid) *base* Durl must be given; otherwise the URL is invalid.
|
||||||
|
|
||||||
|
The *base* Durl can contain a path (but no arguments or fragments),
|
||||||
|
in which case the URL - if not absolute - must begin with this path.
|
||||||
|
|
||||||
|
The scheme must be http or https. If the URL begins with '//',
|
||||||
|
'http:' is prepended.
|
||||||
|
|
||||||
|
If the hostname is longer than 90 characters, the URL is invalid.
|
||||||
|
|
||||||
|
Default port numbers (80 for http, 443 for https) are removed.
|
||||||
|
|
||||||
|
The hostname is changed to lower case. Spaces in the hostname
|
||||||
|
make the URL invalid.
|
||||||
|
|
||||||
|
URL fragments are removed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_url = None
|
||||||
|
_base = None
|
||||||
|
_match_base = False
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
base: Optional['Durl'] = None,
|
||||||
|
match_base: bool = False,
|
||||||
|
):
|
||||||
|
self._url = url
|
||||||
|
self._base = base
|
||||||
|
self._match_base = match_base
|
||||||
|
|
||||||
|
def __await__(self):
|
||||||
|
return self.__ainit__().__await__()
|
||||||
|
|
||||||
|
async def __ainit__(self):
|
||||||
|
res = None
|
||||||
|
try:
|
||||||
|
# add missing scheme for urls beginning with '//'
|
||||||
|
if self._url.startswith('//'):
|
||||||
|
self._url = 'http:' + self._url
|
||||||
|
# split the url
|
||||||
|
durl = urlsplit(self._url)
|
||||||
|
# remove default port numbers 80, 443
|
||||||
|
netloc = durl.netloc
|
||||||
|
if durl.port == 80 and durl.scheme == 'http':
|
||||||
|
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
|
||||||
|
if durl.port == 443 and durl.scheme == 'https':
|
||||||
|
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
|
||||||
|
if durl.hostname and durl.hostname != durl.netloc.lower():
|
||||||
|
user_pass = ''
|
||||||
|
if durl.username and durl.password:
|
||||||
|
user_pass = f'{durl.username}:{durl.password}@'
|
||||||
|
port = ''
|
||||||
|
if durl.port:
|
||||||
|
port = f':{durl.port}'
|
||||||
|
netloc = f'{user_pass}{durl.hostname.lower()}{port}'
|
||||||
|
durl = durl._replace(netloc=netloc)
|
||||||
|
|
||||||
|
if self._base:
|
||||||
|
# if missing fill in scheme and netloc from base
|
||||||
|
if not durl.scheme:
|
||||||
|
durl = durl._replace(scheme=self._base.scheme)
|
||||||
|
if not durl.netloc:
|
||||||
|
durl = durl._replace(netloc=self._base.netloc)
|
||||||
|
# if match_base, then set res only if the
|
||||||
|
# url is compatible with base url
|
||||||
|
if not self._match_base:
|
||||||
|
res = durl
|
||||||
|
else:
|
||||||
|
if durl.netloc == self._base.netloc:
|
||||||
|
if durl.scheme == self._base.scheme:
|
||||||
|
if self._base.path not in ('/', ''):
|
||||||
|
if durl.path.startswith(self._base.path):
|
||||||
|
res = durl
|
||||||
|
else:
|
||||||
|
res = durl
|
||||||
|
else:
|
||||||
|
res = durl
|
||||||
|
except:
|
||||||
|
logger.exception(
|
||||||
|
f'Durl init failed url={self._url}'
|
||||||
|
f' base={self._base} match_base={self._match_base}'
|
||||||
|
)
|
||||||
|
res = None
|
||||||
|
if res:
|
||||||
|
res = res._replace(fragment='')
|
||||||
|
if not res.hostname or len(res.hostname) > 90:
|
||||||
|
res = None
|
||||||
|
elif res.scheme not in ('https', 'http'):
|
||||||
|
res = None
|
||||||
|
elif ' ' in res.hostname or '.' not in res.hostname:
|
||||||
|
res = None
|
||||||
|
elif not (await get_ips(res.hostname)):
|
||||||
|
res = None
|
||||||
|
elif not res.path.startswith('/'):
|
||||||
|
res = res._replace(path='/')
|
||||||
|
if res:
|
||||||
|
if res.fragment is None:
|
||||||
|
res.fragment = ''
|
||||||
|
self._durl = res
|
||||||
|
return self
|
||||||
|
self._durl = None
|
||||||
|
|
||||||
|
def __getattr__(self, attr):
|
||||||
|
return getattr(self._durl, attr)
|
||||||
|
|
||||||
|
def url(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the URL as string.
|
||||||
|
"""
|
||||||
|
return self._durl.geturl()
|
||||||
|
|
||||||
|
def pwa(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the (base-relative) path with args of the Durl.
|
||||||
|
"""
|
||||||
|
if self._base and self._match_base:
|
||||||
|
path = self._durl.path.removeprefix(self._base.path)
|
||||||
|
else:
|
||||||
|
path = self._durl.path
|
||||||
|
qs = f'?{self._durl.query}' if self._durl.query else ''
|
||||||
|
return f'{path}{qs}'.lstrip('/')
|
||||||
|
|
||||||
|
def has_path(self) -> bool:
|
||||||
|
"""
|
||||||
|
Return whether the Durl has a non-trivial path.
|
||||||
|
"""
|
||||||
|
return self._durl.path not in ('/', '')
|
||||||
|
|
||||||
|
def site(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the site (base_url).
|
||||||
|
"""
|
||||||
|
return f'{self._durl.scheme}://{self._durl.netloc}/'
|
||||||
|
|
||||||
|
def domain(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the domain of the Durl (wrong in case of second-level domains).
|
||||||
|
"""
|
||||||
|
levels = extract(self._durl.hostname)
|
||||||
|
return '.'.join(levels[-2:]).lower()
|
||||||
|
|
||||||
|
def replace_scheme(self, scheme: str) -> None:
|
||||||
|
"""
|
||||||
|
Replace the scheme (must be 'http' or 'https').
|
||||||
|
"""
|
||||||
|
self._durl = self._durl._replace(scheme=scheme)
|
||||||
|
|
||||||
|
|
||||||
|
@alru_cache(maxsize=1000)
|
||||||
|
async def get_ips(hostname: str) -> set[str]:
|
||||||
|
"""
|
||||||
|
Return IPv4 and IPv6 addresses of the given hostname.
|
||||||
|
"""
|
||||||
|
ips = set()
|
||||||
|
for type_ in (types.A, types.AAAA):
|
||||||
|
try:
|
||||||
|
res, cached = await resolver.query(hostname, type_)
|
||||||
|
if res:
|
||||||
|
if addr := res.get_record([type_]):
|
||||||
|
ips.add(addr.data)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return ips
|
||||||
|
|
||||||
|
|
||||||
|
def get_url_variants(url: str) -> list[str]:
|
||||||
|
"""
|
||||||
|
Return variants of the URL.
|
||||||
|
|
||||||
|
Replace http with https and vice versa;
|
||||||
|
prepend or remove 'www.' to or from the beginning of the hostname.
|
||||||
|
"""
|
||||||
|
if url.startswith('http://www.'):
|
||||||
|
s = url.removeprefix('http://www.')
|
||||||
|
return [url, f'http://{s}', f'https://www.{s}', f'https://{s}']
|
||||||
|
elif url.startswith('http://'):
|
||||||
|
s = url.removeprefix('http://')
|
||||||
|
return [url, f'http://www.{s}', f'https://www.{s}', f'https://{s}']
|
||||||
|
elif url.startswith('https://www.'):
|
||||||
|
s = url.removeprefix('https://www.')
|
||||||
|
return [url, f'https://{s}', f'http://www.{s}', f'http://{s}']
|
||||||
|
elif url.startswith('https://'):
|
||||||
|
s = url.removeprefix('https://')
|
||||||
|
return [url, f'https://www.{s}', f'http://www.{s}', f'http://{s}']
|
||||||
|
else:
|
||||||
|
return [url]
|
||||||
|
|
||||||
|
|
||||||
|
async def assort_links(
|
||||||
|
links: dict[str, tuple[int, int, list[str]]],
|
||||||
|
durl: Durl,
|
||||||
|
text: str,
|
||||||
|
base_url: str = None,
|
||||||
|
) -> tuple[
|
||||||
|
dict[str, tuple[int, int, list[str]]],
|
||||||
|
dict[Durl, tuple[list[str], str]],
|
||||||
|
dict[Durl, tuple[list[str], str]],
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Sort links into a cleaned, an internal and an external dict.
|
||||||
|
|
||||||
|
The cleaned dict maps absolute URLs to char ranges and relations.
|
||||||
|
The internal dict maps absolute URLs to relations and the linked text.
|
||||||
|
The external dict maps absolute URLs to relations and the linked text.
|
||||||
|
The relations are link relations, e.g. rel="canonical".
|
||||||
|
|
||||||
|
If base_url is set, it is used to distinguish internal and external
|
||||||
|
links. If it is not set, the base_url is obtained from *durl*.
|
||||||
|
"""
|
||||||
|
res_int = {}
|
||||||
|
res_ext = {}
|
||||||
|
if not base_url:
|
||||||
|
base_url = durl.site().lower()
|
||||||
|
base_durl = await Durl(base_url)
|
||||||
|
cleaned_links = {}
|
||||||
|
for href, (i, f, rel) in links.items():
|
||||||
|
durl = await Durl(href, base=base_durl)
|
||||||
|
if not durl:
|
||||||
|
continue
|
||||||
|
if durl.hostname and in_blacklist(durl.hostname):
|
||||||
|
continue
|
||||||
|
cleaned_links[durl.url()] = i, f, rel
|
||||||
|
txt = text[i:f]
|
||||||
|
if durl.site().lower() == base_url:
|
||||||
|
res_int[durl] = rel, txt
|
||||||
|
else:
|
||||||
|
res_ext[durl] = rel, txt
|
||||||
|
return cleaned_links, res_int, res_ext
|
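Durl must be awaited inside a coroutine and performs a DNS lookup, so the sketch below assumes a resolvable example hostname:

async def durl_demo():
    durl = await Durl('https://www.example.com/blog/post?x=1#top')
    if durl:
        durl.url()      # 'https://www.example.com/blog/post?x=1' (fragment dropped)
        durl.site()     # 'https://www.example.com/'
        durl.pwa()      # 'blog/post?x=1'
        durl.domain()   # 'example.com'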
136
src/atextcrawler/utils/html.py
Normal file
@ -0,0 +1,136 @@
|
||||||
|
"""
|
||||||
|
Utilities for extracting information from html.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from html import unescape
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from .lang import clean_lang
|
||||||
|
from .tag import drop_roles, drop_tags, keep_tags
|
||||||
|
|
||||||
|
re_ = {
|
||||||
|
'html_lang': re.compile(
|
||||||
|
'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
|
||||||
|
),
|
||||||
|
'title': re.compile('<title[^>]*>([^<]*)</title>', re.I | re.S),
|
||||||
|
'strip': re.compile(
|
||||||
|
'<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S
|
||||||
|
),
|
||||||
|
'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
|
||||||
|
'whitespace': re.compile('(\s| )+', re.S),
|
||||||
|
'whitespace_': re.compile('\s| ?'), # allow broken  
|
||||||
|
'whitespace_near_tag': re.compile(
|
||||||
|
'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
|
||||||
|
'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
|
||||||
|
re.S,
|
||||||
|
),
|
||||||
|
'whitespace_tag_tag': re.compile('(\s+)((<[^>]+>\s+)+)', re.S),
|
||||||
|
'whitespace_tag_tag_func': re.compile('(<[^>]+>)\s+', re.S),
|
||||||
|
'http_equiv': re.compile('(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def whitespace_tag_tag(match_obj):
|
||||||
|
"""
|
||||||
|
Helper function for removing whitespace between tags.
|
||||||
|
"""
|
||||||
|
return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', match_obj.group(2))
|
||||||
|
|
||||||
|
|
||||||
|
def clean_html(s: Optional[str]) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Clean an html string.
|
||||||
|
|
||||||
|
Unescape htmlentities and replace whitespaces with ' ' (ASCII char 0x20).
|
||||||
|
|
||||||
|
See also: https://www.lesinskis.com/python-unicode-whitespace.html
|
||||||
|
"""
|
||||||
|
return re_['whitespace_'].sub(' ', unescape(s)).strip() if s else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_html_lang(html: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Return the language, if any, found in the lang attribute of the html tag.
|
||||||
|
"""
|
||||||
|
m = re_['html_lang'].search(html)
|
||||||
|
return clean_lang(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_title(html: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Extract title tags from html returning their content as a string.
|
||||||
|
"""
|
||||||
|
if not (titles := re_['title'].findall(html)):
|
||||||
|
return None
|
||||||
|
titles = [clean_html(title) for title in reversed(titles) if title]
|
||||||
|
return ' - '.join(titles).strip(' |')
|
||||||
|
|
||||||
|
|
||||||
|
def clean_page(html):
|
||||||
|
"""
|
||||||
|
Remove unwanted tags including their content from html.
|
||||||
|
|
||||||
|
Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
|
||||||
|
Also drop tags with attribute aria-hidden=true.
|
||||||
|
|
||||||
|
Return a beautiful soup.
|
||||||
|
"""
|
||||||
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
for tag in drop_tags:
|
||||||
|
for n in soup.find_all(tag):
|
||||||
|
n.decompose()
|
||||||
|
for n in soup.find_all(attrs={'aria-hidden': 'true'}):
|
||||||
|
n.decompose()
|
||||||
|
for role in drop_roles:
|
||||||
|
for n in soup.find_all(attrs={'rel': role}):
|
||||||
|
n.decompose()
|
||||||
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
def clean_body(body):
|
||||||
|
"""
|
||||||
|
Clean an html body.
|
||||||
|
|
||||||
|
Remove unwanted tags (keeping their content); remove empty tags;
|
||||||
|
remove and replace whitespaces in several ways.
|
||||||
|
|
||||||
|
In the end the only whitespace is a space and there are no
|
||||||
|
consecutive spaces.
|
||||||
|
"""
|
||||||
|
body = re_['strip'].sub(' ', body)
|
||||||
|
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
|
||||||
|
body = re_['whitespace'].sub(' ', body)
|
||||||
|
while re_['empty_tag'].search(body):
|
||||||
|
body = re_['empty_tag'].sub(r'\3', body)
|
||||||
|
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
|
||||||
|
body = re_['whitespace'].sub(' ', body)
|
||||||
|
body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
|
||||||
|
return body.strip().replace('\u00ad', '') # soft hyphen
|
||||||
|
|
||||||
|
|
||||||
|
def get_html_redirect(html: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Return an html redirect in an http-equiv meta tag.
|
||||||
|
|
||||||
|
If none is found, return None.
|
||||||
|
"""
|
||||||
|
redir_url = None
|
||||||
|
http_equivs = re_['http_equiv'].findall(html)
|
||||||
|
for raw in http_equivs:
|
||||||
|
tag = BeautifulSoup(raw, 'html.parser').meta
|
||||||
|
if tag and tag.get('http-equiv', '').lower() == 'refresh':
|
||||||
|
if content := tag.get('content'):
|
||||||
|
try:
|
||||||
|
_, redir_url = content.split(';')
|
||||||
|
redir_url = (
|
||||||
|
redir_url.strip()
|
||||||
|
.removeprefix('url=')
|
||||||
|
.removeprefix('URL=')
|
||||||
|
.strip("'")
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return redir_url
|
58
src/atextcrawler/utils/http.py
Normal file
@ -0,0 +1,58 @@
|
||||||
|
"""
|
||||||
|
Utility functions related to http.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from multidict import CIMultiDictProxy
|
||||||
|
|
||||||
|
from ..models import Site
|
||||||
|
from .durl import Durl
|
||||||
|
|
||||||
|
re_ = {
|
||||||
|
'link_header': re.compile(',\s*(?=<)'),
|
||||||
|
'rel_canonical': re.compile(';\s*rel\s*=\s*["\']?canonical', re.I),
|
||||||
|
'rel_shortlink': re.compile(';\s*rel\s*=\s*["\']?shortlink', re.I),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def get_header_links(
|
||||||
|
headers: CIMultiDictProxy,
|
||||||
|
durl: Durl,
|
||||||
|
site: Optional[Site],
|
||||||
|
) -> dict[str, Optional[str]]:
|
||||||
|
"""
|
||||||
|
Extract canonical and shortlink links from http headers.
|
||||||
|
|
||||||
|
*durl* must be the Durl of the fetched page and *site* - if not None -
|
||||||
|
must be the Site to which the page belongs.
|
||||||
|
|
||||||
|
Return a (default)dict with 'canonical' and 'shortlink' as keys.
|
||||||
|
The values default to None.
|
||||||
|
"""
|
||||||
|
res = {}
|
||||||
|
canonical = shortlink = None
|
||||||
|
if 'link' in headers and (link_headers := headers.getall('link')):
|
||||||
|
links = []
|
||||||
|
for link_header in link_headers:
|
||||||
|
links += re_['link_header'].split(link_header)
|
||||||
|
url = durl.url()
|
||||||
|
base_url = site.base_url if site else url
|
||||||
|
base_durl = await Durl(base_url) if base_url else None
|
||||||
|
for link in links:
|
||||||
|
if not canonical and 'canonical' in link.lower():
|
||||||
|
if re_['rel_canonical'].search(link):
|
||||||
|
canon_url = link.strip().lstrip('<').split('>')[0]
|
||||||
|
if canon_durl := await Durl(canon_url, base=base_durl):
|
||||||
|
canonical = canon_durl.url()
|
||||||
|
if not shortlink and 'shortlink' in link.lower():
|
||||||
|
if re_['rel_shortlink'].search(link):
|
||||||
|
short_url = link.strip().lstrip('<').split('>')[0]
|
||||||
|
if short_durl := await Durl(short_url, base=base_durl):
|
||||||
|
shortlink = short_durl.url()
|
||||||
|
if canonical and shortlink:
|
||||||
|
break
|
||||||
|
res['canonical'] = canonical
|
||||||
|
res['shortlink'] = shortlink
|
||||||
|
return res
|
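The header format handled here, with illustrative values:

# Link: <https://example.com/?p=123>; rel=shortlink,
#       <https://example.com/2021/11/some-post/>; rel="canonical"
#
# For such a header, get_header_links() returns
# {'canonical': 'https://example.com/2021/11/some-post/',
#  'shortlink': 'https://example.com/?p=123'}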
32
src/atextcrawler/utils/json.py
Normal file
@ -0,0 +1,32 @@
|
||||||
|
"""
|
||||||
|
Custom JSON encoder.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class JSONEncoderExt(json.JSONEncoder):
|
||||||
|
"""
|
||||||
|
Extended JSON encoder with encoding of sets as lists.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def default(self, obj):
|
||||||
|
"""
|
||||||
|
Encode sets as lists and everything else as by default.
|
||||||
|
"""
|
||||||
|
if isinstance(obj, set):
|
||||||
|
return list(obj)
|
||||||
|
return json.JSONEncoder.default(self, obj)
|
||||||
|
|
||||||
|
|
||||||
|
def json_dumps(obj):
|
||||||
|
"""
|
||||||
|
Encode an object to a JSON string using JSONEncoderExt.
|
||||||
|
"""
|
||||||
|
return json.dumps(obj, cls=JSONEncoderExt)
|
||||||
|
|
||||||
|
|
||||||
|
json_loads = json.loads
|
||||||
|
"""
|
||||||
|
Decoding of JSON strings as by default.
|
||||||
|
"""
44
src/atextcrawler/utils/lang.py
Normal file
@@ -0,0 +1,44 @@
"""
Utility functions related to languages.
"""

from pathlib import Path
from typing import Optional

import gcld3

asset_path = Path(__file__).parent.parent / 'assets'


with open(asset_path / 'iso_639-1', 'r') as f:
    iso_639_1_codes = f.read().strip().split('\n')


lang_detector = gcld3.NNetLanguageIdentifier(
    min_num_bytes=0, max_num_bytes=1000
)


def clean_lang(lang: Optional[str]) -> Optional[str]:
    """
    Clean a language code string: it must be an ISO 639-1 code or None.
    """
    if lang is None:
        return None
    lang = lang[:2].lower()
    if lang in iso_639_1_codes:
        return lang
    return None


def extract_content_language(text: str) -> Optional[str]:
    """
    Extract the language from a text.
    """
    if len(text) < 10:
        return None
    lang = None
    lang_det = lang_detector.FindLanguage(text=text)
    if lang_det.is_reliable:
        lang = lang_det.language[:2]
    return lang
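
Hypothetical usage, assuming gcld3 and the bundled iso_639-1 asset are available (the detection result depends on gcld3's model):

from atextcrawler.utils.lang import clean_lang, extract_content_language

print(clean_lang('EN-us'))   # 'en'
print(clean_lang('zz'))      # None (not an ISO 639-1 code)
print(extract_content_language('Guten Tag, wie geht es Ihnen heute?'))  # probably 'de'
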
116
src/atextcrawler/utils/link.py
Normal file
@@ -0,0 +1,116 @@
"""
Hyperlinks (a href, link).
"""

from pathlib import Path
from typing import Optional

import tldextract

nofollow_link_rels = set(
    [
        'nofollow',
        'search',
        'noreferrer',
        'noopener',
        'help',
        'license',
    ]
)
"""
Do not follow the hrefs in anchor tags with these values of the rel attribute.
"""


meta_names = (
    'generator',
    'lang',
    'language',
    'description',
    'keywords',
    'author',
    'title',
    'subject',
    'revised',
    'abstract',
    'topic',
    'summary',
    'classification',
    'category',
    'reply-to',
    'owner',
    'url',
    'identifier-URL',
    'geo.position',
    'geo.region',
    'geo.placename',
    'dc.language',
)
"""
Values of the name attribute of meta tags to keep.

See also: https://gist.github.com/lancejpollard/1978404
See also: https://github.com/joshbuchea/HEAD
"""


meta_props = (
    'og:site_name',
    'og:locale',
    'og:type',
    'og:latitude',
    'og:longitude',
    'og:street',
    'og:locality',
    'og:region',
    'og:postal',
    'og:country',
)
"""
Values of the property attribute of meta tags to keep.
"""


link_rels = set(
    [
        'webmention',
        'pingback',
        'alternate',
        'canonical',
        'author',
    ]
)
"""
Values of the rel attribute of link tags to keep.
"""


def load_blacklist():
    """
    Return the 10000 most popular internet domains.
    """
    path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
    with open(path, 'r') as file:
        domains = file.read().strip().splitlines()
    return domains


domain_blacklist = load_blacklist()


def in_blacklist(hostname: str) -> Optional[str]:
    """
    Return a match of the hostname in the blacklist, or None.
    """
    domain = extract_domain(hostname)
    if domain in domain_blacklist:
        return hostname
    return None


def extract_domain(hostname: str) -> str:
    """
    Extract the lower-case domain from a hostname.
    """
    levels = tldextract.extract(hostname)
    return '.'.join(levels[-2:]).lower()
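
Hypothetical usage, assuming tldextract and the top_1e4 asset are available; the blacklist result depends on that asset's contents:

from atextcrawler.utils.link import extract_domain, in_blacklist

print(extract_domain('Blog.Example.CO.UK'))    # 'example.co.uk'
print(in_blacklist('www.google.com'))          # probably 'www.google.com' (popular domain)
print(in_blacklist('tiny-obscure-site.org'))   # None if its domain is not in the top 10000
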
120
src/atextcrawler/utils/muse.py
Normal file
@@ -0,0 +1,120 @@
"""
Parse muse-formatted plaintext (delivered by amusewiki).
"""

import re
from datetime import datetime
from typing import Optional

from .date_finder import extract_latest_date
from .lang import clean_lang

re_tag = re.compile(r'<[^<]+?>')


def parse_muse(text: str) -> Optional[tuple[dict, str]]:
    """
    Parse a MUSE string returning meta information and the text body.
    """
    head, body = split_head_body(text)
    if not head:
        return None
    meta = parse_head(head)
    if not meta:
        return None
    return extract_muse_meta(meta, body), body


def split_head_body(text: str) -> tuple[str, str]:
    """
    Split a MUSE string into head and body and return both.
    """
    head = ''
    while text.startswith('#'):
        line_end = text.find('\n') + 1
        head += text[:line_end]
        text = text[line_end:]
    return head.strip(), text.strip()


def parse_head(text: str) -> dict:
    """
    Parse a MUSE head and return a dict mapping field names to values.
    """
    fields = {}
    for line in text.split('\n'):
        name, value = line.strip().split(' ', 1)
        fields[name[1:]] = value
    return fields


amusewiki_fields = [
    'author',
    'title',
    'lang',
    'LISTtitle',  # reduced title for alphabetical sorting
    'subtitle',
    'SORTauthors',  # authors separated by ';' or ',' (only for indexing)
    'SORTtopics',  # topics separated by ';' or ',' (only for indexing)
    'date',  # publication year
    'pubdate',  # publication datetime
    'notes',  # additional info (orig title, translators, credits, ...)
    'source',  # preferred format: "Retrieved on March 8, 2012 from {URL}"
    'publisher',
    'isbn',
    #'rights',
    'seriesname',
    'seriesnumber',
    #'hyphenation',  # irrelevant
    #'slides',  # irrelevant
    #'DELETED',  # irrelevant
    #'cover',  # irrelevant
    #'coverwidth',  # irrelevant
    #'nocoverpage',  # irrelevant
    #'notoc',  # irrelevant
    #'nofinalpage',  # irrelevant
    #'impressum',  # irrelevant
    #'continuefootnotes',  # irrelevant
    #'centerchapter',  # irrelevant
    #'centersection',  # irrelevant
]
"""
Amusewiki fields (cf. https://amusewiki.org/library/manual).
"""


re_list = re.compile('[;,]')


def extract_muse_meta(meta, body) -> dict:
    """
    Extract meta information from muse header and muse body.
    """
    authors = set()
    if author := meta.get('author', '').strip():
        authors.add(author)
    if sortauthors := meta.get('SORTauthors', '').strip():
        for author in re_list.split(sortauthors):
            if author_ := author.strip():
                authors.add(author_)
    pubdate = meta.get('pubdate', '').strip()
    pub_date: Optional[datetime] = None
    if pubdate:
        try:
            pub_date = datetime.fromisoformat(pubdate)
        except ValueError:
            pub_date = extract_latest_date(pubdate)
    summary = re_tag.sub('', body[:1000].split('\n\n')[0])
    return {
        'title': re_tag.sub('', meta.get('title', '')) or None,
        'authors': authors,
        'lang': clean_lang(meta.get('lang')),
        'keywords': [
            s.strip()
            for s in re_list.split(meta.get('SORTtopics', '').strip())
            if s.strip()
        ],
        'pub_date': pub_date,
        'summary': summary,
        'orig_source': meta.get('source', '').strip() or None,
    }
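
A hypothetical round trip through parse_muse with a tiny made-up MUSE document (assuming the package and its assets are importable):

from atextcrawler.utils.muse import parse_muse

muse_text = (
    '#title An Example Text\n'
    '#author Jane Doe\n'
    '#lang en\n'
    '#pubdate 2021-11-01T00:00:00\n'
    '\n'
    'First paragraph of the body.\n'
    '\n'
    'Second paragraph.'
)
meta, body = parse_muse(muse_text)
print(meta['title'], meta['authors'], meta['lang'], meta['pub_date'])
print(body.startswith('First paragraph'))  # True
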
22
src/atextcrawler/utils/probe.py
Normal file
@@ -0,0 +1,22 @@
"""
Utility functions for probing / sampling.
"""


def extract_samples(items, n=5):
    """
    Extract up to n sample elements from the given dict or list.

    If *items* is a dict return the elements from the list of keys.
    """
    l = len(items)
    if l <= n:
        return items
    poss = []
    step = (l + 1) / n
    for i in range(n):
        pos = int(step * i)
        if pos < l and (not poss or pos > poss[-1]):
            poss.append(pos)
    items_list = list(items)
    return [items_list[pos] for pos in poss]
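
A quick sketch of the sampling behaviour (hypothetical, assuming the package is importable):

from atextcrawler.utils.probe import extract_samples

print(extract_samples(list(range(20)), n=5))  # [0, 4, 8, 12, 16]
print(extract_samples(['a', 'b', 'c'], n=5))  # ['a', 'b', 'c'] (short input returned as is)
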
74
src/atextcrawler/utils/section.py
Normal file
@@ -0,0 +1,74 @@
"""
Operations on text sections.

Semantic breaks are character positions within a text (0-offset)
where a new section begins. More precisely, the character position
contains a space, and only at the next position does a semantically
breaking tag (e.g., an h1 or a br) begin.

Each semantic break has a level, which indicates its breaking strength.
The lower the level (e.g., h1 has a lower level than h2), the
stronger the break.

Implicitly, if position 0 has no semantic break, a semantic break
at position 0 with level 80 is added.

Semantic breaks can be used to split a text into sections.
The lower the maximum level of the semantic breaks taken into account,
the coarser the segmentation and the fewer the sections.
Each section is given the level of the semantic break at its beginning.

From another point of view, sections have levels indicating
the segmentation depth.

The levels for html tags are defined in tag.py.

The *semantic_breaks* argument in the functions below
is a dict mapping the character position of the semantic break
to the level of a section beginning at this position
(if segmentation is done at this or a higher level).
"""


def iter_sections(text, semantic_breaks, max_level=59):
    """
    Iterate over sections, limiting to those with a maximum level.

    Yield (start_pos, end_pos, level, text).
    *text* is assumed to have the first semantic break at position 0.
    """
    n = len(text)
    last_pos = 0
    last_level = semantic_breaks.get(0, 80)
    for pos, level in sorted(semantic_breaks.items()):
        if level <= max_level and last_pos != pos:
            yield last_pos, pos, last_level, text[last_pos + 1 : pos]
            last_pos = pos
            last_level = level
    if last_pos < n:
        yield last_pos, n, last_level, text[last_pos:]


def concat_section_texts(text, semantic_breaks, min_len=2000):
    """
    Try to concat consecutive sections into chunks with a minimum length.

    Yield (section_ids, combined_text).
    """
    n = len(text)
    last_pos = 0
    section_ids = []
    for section_id, pos in enumerate(semantic_breaks.keys()):
        if pos >= last_pos + min_len:
            if n - pos < min_len:
                for id_ in [
                    i for i, k in enumerate(semantic_breaks.keys()) if k >= pos
                ]:
                    section_ids.append(id_)
                pos = n
            yield section_ids, text[last_pos:pos]
            last_pos = pos
            section_ids = []
        section_ids.append(section_id)
    if last_pos < n:
        yield section_ids, text[last_pos:]
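
A usage sketch for iter_sections, mirroring the first test case in tests/section.py further below (assuming the package is importable):

from atextcrawler.utils.section import iter_sections

text = 'abcdefghijklmnopqrstuvwxyz'
semantic_breaks = {0: 80, 5: 2, 15: 1, 20: 3}
for start, end, level, chunk in iter_sections(text, semantic_breaks, max_level=100):
    print(start, end, level, repr(chunk))
# (0, 5, 80, 'bcde'), (5, 15, 2, 'ghijklmno'), (15, 20, 1, 'qrst'), (20, 26, 3, 'uvwxyz')
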
92
src/atextcrawler/utils/similarity.py
Normal file
@@ -0,0 +1,92 @@
"""
Text similarity with simhash.
"""

import logging

from asyncpg import Connection
from simhash import Simhash, SimhashIndex

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


postgresql_bigint_offset = 9223372036854775808
"""
Subtract this number to get a PostgreSQL bigint from a 64bit int.
"""


def get_features(txt: str) -> list[str]:
    """
    Extract features from string for use with Simhash.
    """
    width = 3
    txt = txt.replace(' ', '').lower()
    return [txt[i : i + width] for i in range(max(len(txt) - width + 1, 1))]


def simhash_to_bigint(simhash: Simhash) -> int:
    """
    Convert a simhash to PostgreSQL's bigint value range.
    """
    return simhash.value - postgresql_bigint_offset


def simhash_from_bigint(bigint: int) -> Simhash:
    """
    Convert a simhash from PostgreSQL's bigint to a Simhash instance.
    """
    return Simhash(bigint + postgresql_bigint_offset, log=logger)


def get_simhash(text: str) -> Simhash:
    """
    Return the Simhash of the given text.
    """
    return Simhash(get_features(text), log=logger)


async def get_simhash_index(conn: Connection, site_id: int) -> SimhashIndex:
    """
    Return a simhash index with hashes of all stored resources of the site.
    """
    sql = (
        "SELECT r.id, r.simhash FROM site_path sp, resource r"
        " WHERE sp.site_id=$1 AND sp.resource_id=r.id"
    )
    rows = await conn.fetch(sql, site_id)
    objs = [
        (
            str(row['id']),
            Simhash(row['simhash'] + postgresql_bigint_offset, log=logger),
        )
        for row in rows
    ]
    return SimhashIndex(objs, k=3, log=logger)


def create_simhash(
    index: SimhashIndex,
    resource_id: int,
    simhash_instance: Simhash,
) -> int:
    """
    Add a resource with given id and simhash to a simhash index.

    Return the simhash value shifted into PostgreSQL's bigint range.

    (The simhash field of the resource's database entry is not updated.)
    """
    index.add(str(resource_id), simhash_instance)
    return simhash_to_bigint(simhash_instance)


def search_simhash(index: SimhashIndex, simhash_inst: Simhash) -> list[int]:
    """
    Return the ids of similar resources from the index.
    """
    found = index.get_near_dups(simhash_inst)
    if found:
        return sorted([int(elem) for elem in found])
    return []
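
A usage sketch for the in-memory index functions, mirroring tests/simhash.py further below (assuming the simhash package is installed):

from simhash import SimhashIndex

from atextcrawler.utils.similarity import create_simhash, get_simhash, search_simhash

index = SimhashIndex([], k=3)
create_simhash(index, 101, get_simhash('hello ' * 20))
create_simhash(index, 102, get_simhash('another one'))

print(search_simhash(index, get_simhash('hello ' * 20 + 'X')))  # [101]
print(search_simhash(index, get_simhash('another one')))        # [102]
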
189
src/atextcrawler/utils/tag.py
Normal file
@@ -0,0 +1,189 @@
"""
Information collections related to html tags.
"""


drop_tags = [
    'applet',
    'area',
    'audio',
    'base',
    'basefont',
    'bdi',
    'bdo',
    'button',
    'canvas',
    'code',
    'command',
    'data',
    'datalist',
    'dir',
    'embed',
    'fieldset',
    'figure',
    'form',
    'frame',
    'frameset',
    'iframe',
    'img',
    'input',
    'label',
    'legend',
    'map',
    'menuitem',
    'meter',
    'noframes',
    'noscript',
    'object',
    'optgroup',
    'option',
    'param',
    'picture',
    'progress',
    'rp',
    'rt',
    'ruby',
    'samp',
    'script',
    'select',
    'source',
    'style',
    'svg',
    'template',
    'textarea',
    'track',
    'var',
    'video',
]
"""
Tags to drop, including their content.
"""


keep_tags = {
    'a': (0, 0, ''),
    'abbr': (0, 0, 'st'),
    'acronym': (0, 0, 'st'),
    'address': (1, 0, 'm'),
    'article': (1, 15, ''),
    'aside': (1, 0, 'd'),
    'b': (0, 0, 'st'),
    'blockquote': (1, 65, 'q'),
    'br': (1, 80, ''),
    'caption': (1, 68, ''),
    'center': (1, 50, ''),
    'cite': (1, 0, 'd'),
    'col': (1, 75, ''),
    'colgroup': (1, 73, ''),
    'dd': (1, 70, 'li'),
    'del': (0, 0, 'se'),
    'details': (1, 0, 'd'),
    'dfn': (0, 0, 'st'),
    'div': (1, 60, ''),  # lvl often revised to min of contained tags
    'dl': (1, 70, 'l'),
    'dt': (1, 70, 'li'),
    'em': (0, 0, 'st'),
    'figcaption': (1, 0, ''),
    'font': (0, 0, 's'),
    'footer': (1, 15, ''),
    'h1': (1, 30, ''),
    'h2': (1, 32, ''),
    'h3': (1, 34, ''),
    'h4': (1, 36, ''),
    'h5': (1, 38, ''),
    'h6': (1, 40, ''),
    'header': (1, 15, ''),
    'hr': (1, 30, ''),
    'i': (0, 0, 'st'),
    'ins': (0, 0, 'se'),
    'li': (1, 75, 'li'),  # lvl revised if not inside p
    'main': (1, 10, ''),
    'mark': (0, 0, 's'),
    'nav': (1, 0, ''),  # keep for footnotes
    'ol': (1, 70, 'l'),  # lvl revised if not inside p
    'p': (1, 60, ''),
    'pre': (1, 65, 'q'),
    'q': (1, 0, 'q'),
    's': (0, 0, ''),
    'section': (1, 24, ''),
    'small': (0, 0, 'd'),
    'span': (0, 0, 's'),
    'strike': (0, 0, 'se'),
    'strong': (0, 0, 'st'),
    'sub': (0, 0, ''),
    'summary': (1, 20, 'm'),
    'sup': (0, 0, ''),
    'table': (1, 65, ''),
    'tbody': (1, 70, ''),
    'td': (1, 78, ''),
    'tfoot': (1, 70, ''),
    'th': (1, 75, ''),
    'thead': (1, 70, ''),
    'time': (0, 0, 'm'),
    'tr': (1, 75, ''),
    'u': (0, 0, 's'),
    'ul': (1, 70, 'l'),  # lvl revised if not inside p
}
"""
Tags to keep for annotation, and their properties.

The properties are:

* sep: whether to separate text at both sides of the tag with a space
* lvl: structural depth level of content of this tag;
  the paragraph level is 60; headings are below 60, listings above;
  a div below the tag will usually have the tag's depth + 1
* sem: semantic categories: zero or more of
  * s=span
  * l=listing
  * i=list_item
  * t=term
  * e=edit
  * d=details
  * q=quote
  * m=meta
  * x=exclude
"""


self_closing_tags = ('br', 'hr')
"""
Those among keep_tags which are self-closing.
"""


all_self_closing_tags = (
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
)
"""
All self-closing tags of the html standard.
"""


drop_roles = (
    'banner',
    'complementary',
    'contentinfo',
    'dialog',
    'figure',
    'form',
    'img',
    'search',
    'switch',
)
"""
Drop tags with these aria roles.
"""
7
tests/__init__.py
Normal file
@@ -0,0 +1,7 @@
from .annotation import AnnotateTest
from .date_finder import DateFinderTest
from .page import PageCleanTest
from .section import IterSectionTest, AggSectionTest
from .simhash import SimhashTest
from .text import CleanHtmlTest
from .durl import DurlTest
49
tests/annotation.py
Normal file
@@ -0,0 +1,49 @@
"""
Test cases for text annotation.
"""

from unittest import TestCase

from atextcrawler.utils.annotation import annotate


class AnnotateTest(TestCase):
    """
    Test annotation.

    Consider that the <br> and <hr> tags are self-closing.
    """

    def test_annotate_1(self):
        s = '<em>Hello</em><br><strong>world</strong>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(anns['section_ids'], {})

    def test_annotate_2(self):
        s = '<em> Hello </em><br><strong> world </strong>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(anns['section_ids'], {})

    def test_annotate_3(self):
        s = '<p> Hello <em>world</em> </p> '
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 60})

    def test_annotate_4(self):
        s = '<div id = "ref1"><p>Hello <em>world</em> </p> </div>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 60})
        self.assertEqual(anns['section_ids'], {0: ['ref1']})

    def test_annotate_5(self):
        s = '<br id="ref2"> Hello <p>world </p> '
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 60})
        self.assertEqual(anns['section_ids'], {1: ['ref2']})
20
tests/date_finder.py
Normal file
@@ -0,0 +1,20 @@
from datetime import datetime
from unittest import TestCase

from atextcrawler.utils.date_finder import extract_latest_date


class DateFinderTest(TestCase):
    def test_extract_latest_date(self):
        s = 'test 1987-2+1-no'
        r = datetime(1987, 2, 1)
        self.assertEqual(extract_latest_date(s), r)
        s = '2020-04-06, whatever and 1987-2-1, 1/20/2021'
        r = datetime(2020, 4, 6)
        self.assertEqual(extract_latest_date(s, lang='de'), r)
        s = 'test 2022-04-06, whatever and 1987-2-1, 1/20/2021'
        r = datetime(2021, 1, 20)
        self.assertEqual(extract_latest_date(s, lang='en'), r)
        s = ''
        r = None
        self.assertEqual(extract_latest_date(s), r)
68
tests/durl.py
Normal file
@@ -0,0 +1,68 @@
from unittest import IsolatedAsyncioTestCase
import asyncpg
from atextcrawler.utils.durl import Durl
from atextcrawler.config import Config
from atextcrawler.db import PGPool


class DurlTest(IsolatedAsyncioTestCase):
    async def asyncSetUp(self):
        config = Config().get()
        self.pool = PGPool(config['postgresql'])
        await self.pool.__aenter__()
        self.conn = await self.pool.pool.acquire()

    async def test_durl_basic(self):
        durl1 = await Durl('https://U:Pw@www.EXAMPLE.com:8000/hello?world#a')
        self.assertEqual(durl1.scheme, 'https')
        self.assertEqual(durl1.netloc, 'U:Pw@www.example.com:8000')
        self.assertEqual(durl1.port, 8000)
        self.assertEqual(durl1.path, '/hello')
        self.assertEqual(durl1.fragment, '')
        self.assertEqual(durl1.pwa(), 'hello?world')
        self.assertEqual(durl1.site(), 'https://U:Pw@www.example.com:8000/')
        self.assertEqual(
            durl1.url(), 'https://U:Pw@www.example.com:8000/' 'hello?world'
        )
        self.assertEqual(durl1.has_path(), True)
        durl2 = await Durl('http://www.example.com/')
        self.assertEqual(durl2.has_path(), False)
        durl3 = await Durl('ftp://www.example.com/')
        self.assertEqual(durl3, None)

    async def test_durl_with_base(self):
        durl1 = await Durl('https://www.example.com')
        self.assertEqual(durl1.path, '/')
        self.assertEqual(durl1.pwa(), '')
        self.assertEqual(durl1.has_path(), False)
        durl2 = await Durl('https://www.example.com/hello2', base=durl1)
        self.assertEqual(durl2.hostname, 'www.example.com')
        self.assertEqual(durl2.path, '/hello2')
        self.assertEqual(durl2.pwa(), 'hello2')
        durl3 = await Durl('/hello3?x=1', base=durl1)
        self.assertEqual(durl3.hostname, 'www.example.com')
        self.assertEqual(durl3.path, '/hello3')
        self.assertEqual(durl3.pwa(), 'hello3?x=1')
        self.assertEqual(durl3.site(), 'https://www.example.com/')
        durl4 = await Durl('https://www.kernel.org/', base=durl1)
        self.assertEqual(durl4, None)

    async def test_durl_with_base_and_match_base(self):
        durl1 = await Durl('https://www.example.com/base/path/')
        self.assertEqual(durl1.path, '/base/path/')
        self.assertEqual(durl1.pwa(), 'base/path/')
        self.assertEqual(durl1.has_path(), True)
        durl2 = await Durl(
            'https://www.example.com/base/', base=durl1, match_base=True
        )
        self.assertEqual(durl2, None)
        durl3 = await Durl(
            'https://www.example.com/base/path/whatever?x=1#a',
            base=durl1,
            match_base=True,
        )
        self.assertEqual(durl3.pwa(), 'whatever?x=1')

    async def asyncTearDown(self):
        await self.pool.pool.release(self.conn)
        await self.pool.pool.close()
24
tests/page.py
Normal file
@@ -0,0 +1,24 @@
"""
Test cases for resource type page.
"""

from unittest import TestCase
from atextcrawler.utils.html import clean_body

# from atextcrawler.utils.tag import drop_tags


class PageCleanTest(TestCase):
    def test_clean_body_1(self):
        s = ' <em>Hello</em> <strong>world</strong> '
        r = '<em>Hello</em> <strong>world</strong>'
        self.assertEqual(clean_body(s), r)


# def test_drop_tags(self):
#     s = '<figure what="ever">something<figure>else</figure>...</figure>'
#     r = drop_tags(s)
#     self.assertEqual(r, '')
#     s = '<rt><rt><rt><rt>something</rt></rt></rt></rt>'
#     r = drop_tags(s)
#     self.assertEqual(r, '')
105
tests/section.py
Normal file
@@ -0,0 +1,105 @@
from unittest import TestCase

from atextcrawler.utils.section import concat_section_texts, iter_sections


class IterSectionTest(TestCase):
    def test_iter_sections_1(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 80, 5: 2, 15: 1, 20: 3}
        sections1 = list(iter_sections(s, sb, max_level=100))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_2(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 4, 5: 2, 15: 1, 20: 3, 26: 9}
        sections1 = list(iter_sections(s, sb, max_level=100))
        sections2 = [
            (0, 5, 4, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'vwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_3(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {5: 2, 15: 60, 18: 50, 20: 3}
        sections1 = list(iter_sections(s, sb, max_level=59))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_4(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {5: 2, 15: 60, 18: 50, 20: 3, 24: 60}
        sections1 = list(iter_sections(s, sb, max_level=59))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)


class AggSectionTest(TestCase):
    def test_concat_sections_1(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 15: 1, 20: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_2(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 2: 1, 10: 1, 20: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghij'),
            ([2, 3, 4], 'klmnopqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_3(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 4: 1, 6: 1, 16: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1, 2], 'abcdefghijklmnop'),
            ([3, 4], 'qrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_4(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 15: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_5(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 12: 1, 22: 1, 23: 1, 24: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijkl'),
            ([2, 3, 4, 5, 6], 'mnopqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)
54
tests/simhash.py
Normal file
@@ -0,0 +1,54 @@
"""
Test cases for simhash-based similarity.
"""

from unittest import TestCase
from simhash import Simhash, SimhashIndex
from atextcrawler.utils.similarity import (
    create_simhash,
    get_features,
    get_simhash,
    postgresql_bigint_offset,
    search_simhash,
)


class SimhashTest(TestCase):
    """
    Test simhash creation and search.
    """

    def test_search(self):
        n1 = int('1111111100000000', 2)
        n2 = int('1111111100000111', 2)
        n3 = int('1000000000000000', 2)
        n4 = int('1000000000000111', 2)
        n5 = int('1000001111000000', 2)
        objs = [
            ('1', Simhash(n1)),
            ('3', Simhash(n3)),
            ('4', Simhash(n4)),
        ]
        index = SimhashIndex(objs, k=3)
        found = search_simhash(index, Simhash(n5))
        self.assertEqual(found, [])
        found = search_simhash(index, Simhash(n1))
        self.assertEqual(found, [1])
        found = search_simhash(index, Simhash(n2))
        self.assertEqual(found, [1])
        found = search_simhash(index, Simhash(n4))
        self.assertEqual(found, [3, 4])

    def test_create(self):
        index = SimhashIndex([], k=3)
        hash_val_1 = create_simhash(index, 101, get_simhash('hello ' * 20))
        hash_val_2 = create_simhash(index, 102, get_simhash('another one'))
        simhash_1 = Simhash(hash_val_1 + postgresql_bigint_offset)
        simhash_2 = Simhash(hash_val_2 + postgresql_bigint_offset)
        found = search_simhash(index, simhash_1)
        self.assertEqual(found, [101])
        found = search_simhash(index, simhash_2)
        self.assertEqual(found, [102])
        simhash_3 = get_simhash('hello ' * 20 + 'X')
        found = search_simhash(index, simhash_3)
        self.assertEqual(found, [101])
65
tests/text.py
Normal file
@@ -0,0 +1,65 @@
"""
Test cases for text util.
"""

from unittest import TestCase
from atextcrawler.utils.html import clean_page


class CleanHtmlTest(TestCase):
    """
    Test clean_page.

    Have an eye on self-closing tags (br, hr, ...).
    """

    def test_clean_page_1(self):
        s = '<em>Hello</em><br><script>malicious="<script>"</script>anything'
        r = '<em>Hello</em><br/>anything'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_2(self):
        s = '<em>Hello</em><br /><script>malicious<script></script>anything'
        r = '<em>Hello</em><br/>anything'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_3(self):
        # nesting
        s = '--<figure>xx<figure>yy</figure>zz</figure>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_4(self):
        # aria-hidden
        s = '--<p aria-hidden=true>xx</p>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden="true">xx</p>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden=false>xx</p>..'
        r = '--<p aria-hidden="false">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden="false">xx</p>..'
        r = '--<p aria-hidden="false">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden=??>xx</p>..'
        r = '--<p aria-hidden="??">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_5(self):
        # no removal
        s = '--<p>xx<em>yy</em></p>..'
        r = '--<p>xx<em>yy</em></p>..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_6(self):
        # self-closing tags to be removed
        s = '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn'
        r = '--<p>xx</p>\n...<h1>tt</h1>nn'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_7(self):
        s = '--<p rel=search>tt<area /></p>nn'
        r = '--nn'
        self.assertEqual(str(clean_page(s)), r)