Put under version control

ibu 2021-11-29 09:16:31 +00:00
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

51
.gitignore vendored Normal file

@@ -0,0 +1,51 @@
# Backup files
*.~
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
NOTES
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
htmlcov
# Translations
*.mo
# mypy cache
.mypy_cache
# Sphinx documentation
doc/build/
doc/source/reference/
# tmp dir
tmp/

30
.pre-commit-config.yaml Normal file

@@ -0,0 +1,30 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 21.11b1
hooks:
- id: black
- repo: https://github.com/timothycrosley/isort
rev: 5.10.1
hooks:
- id: isort
args: ["--profile", "black", "--filter-files", "-l", "79"]
- repo: https://github.com/myint/autoflake
rev: v1.4
hooks:
- id: autoflake
args:
[
"--in-place",
"--remove-all-unused-imports",
"--ignore-init-module-imports",
"--remove-unused-variables",
]

46
Pipfile Normal file

@@ -0,0 +1,46 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
aiohttp = "*"
async-lru = "*"
asyncpg = "*"
beautifulsoup4 = "*"
elasticsearch = { version = ">=7.0.0,<8.0.0", extras = ['async'] }
elasticsearch-dsl = { version = ">=7.0.0,<8.0.0" }
feedparser = "*"
gcld3 = "*"
# TODO: recheck
pypandoc = "*"
pytidylib = "*"
pytz = "*"
pyyaml = "*"
tika = "*"
tldextract = "*"
voluptuous = "*"
simhash = "*"
async-dns = "*"
types-pyyaml = "*"
sphinx-rtd-theme = "*"
[dev-packages]
mypy = "*"
pre-commit = "*"
sphinx = "*"
myst-parser = "*"
isort = "*"
blacken-docs = "*"
pybetter = "*"
interrogate = "*"
autoflake = "*"
types-pyyaml = "*"
types-pytz = "*"
black = "*"
[requires]
python_version = "3.9"
[pipenv]
allow_prereleases = true

1561
Pipfile.lock generated Normal file

File diff suppressed because it is too large.

13
README.md Normal file

@@ -0,0 +1,13 @@
atextcrawler is an asynchronous webcrawler indexing text for literal and semantic search.
Its client-side counterpart is [atextsearch](https://gitea.multiname.org/a-text/atextsearch)
atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.
atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

20
doc/Makefile Normal file

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

71
doc/source/conf.py Normal file

@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys
proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')
# -- Project information -----------------------------------------------------
project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'
# The full version, including alpha/beta/rc tags
release = '0.1.0'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'myst_parser',
'sphinx.ext.graphviz',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
autosummary_generate = True
source_suffix = {
'.rst': 'restructuredtext',
'.md': 'markdown',
}


@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.
# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/
# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/


@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler
# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod
# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info
# Plugins directory
# If given as relative path, it will be relative to the
# directory of this file (main.yaml).
# Read documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins
# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
host: localhost
port: 5432
database: atextcrawler
user: atextcrawler
password: ________________________
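# Note: The whole postgresql section can also be supplied via the
# environment variable ATEXTCRAWLER_POSTGRESQL, which then takes
# precedence over the values given here; illustrative example value:
# ATEXTCRAWLER_POSTGRESQL="host=localhost port=5432 database=atextcrawler user=atextcrawler password=________"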
# Crawling
crawl:
# Number of concurrent workers
# Default value: 10
# Allowed values: integer >=0 and <=1000
#workers: 3
# Delay in seconds between attempts to fetch items
# from site_queue if the last attempt gave no item
# Also the delay in seconds after a worker has found
# no site to process
# Default value: 600
# Allowed values: positive number
#site_delay: 10
# Time interval in seconds between site updates when
# handling queued base URLs
# Default value: 3600
# Allowed values: positive number
#site_revisit_interval: 3600
# Delay in seconds between attempts to process
# individual resources (pages etc.) of a site
# Default value: 5
# Allowed values: positive number
#resource_delay: 3
# Default interval in seconds between full crawls of a site
# Default value: 864000 (10 days)
# Allowed values: positive number
#full_crawl_interval: 864000
# Default interval in seconds between feed crawls of a site
# Default value: 86400 (1 day)
# Allowed values: positive number
#feed_crawl_interval: 86400
# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
# host on which ES is running
host: localhost
# API key for accessing ES
api_key: "**********************"
# API user id
id: "**********************"
# Index base name (actual index names will have '_text' etc. appended)
index_base_name: atext
# Tensorflow access
tensorflow:
# The prediction endpoint of the model server's sentence model
model_server_endpoint: http://localhost:9000/v1/models/sentences:predict


@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.
This plugin implements :func:`rp_filter`.
"""
from typing import Optional
def rp_filter(site, durl) -> Optional[str]:
"""
Adjust or filter found paths (may depend on site).
To filter out a path (i.e., not add it to table `site_path`)
return None.
"""
path = durl.pwa()
# skip fetching images (linked from a tags; img tags are skipped anyway)
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
return None
path = path.removesuffix('?amp=1')
return path


@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.
This plugin implements :func:`site_filter`.
"""
import re
from atextcrawler.models import Site
MIN_RELEVANCE_SCORE = 5
async def site_filter(site: Site) -> bool:
"""
Assess relevance of the site (using language-dependent criteria).
If the site shall be crawled, return True, else False.
"""
# limit to sites in English or German language
if not set(['de', 'en']) & set(site.langs):
return False
score = 0.0
for crit_name, weight, langs, crit_re in re_criteria:
if '*' in langs or set(langs) & set(site.langs):
findings = crit_re.findall(site.startpage_text)
if findings:
score += weight * len(findings)
if site.title and crit_re.search(site.title):
score += 4 * weight
if site.description and crit_re.search(site.description):
score += 4 * weight
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
return score >= MIN_RELEVANCE_SCORE
re_criteria = {
(
'anarch',
1.0,
('*',),
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
),
('libertär', 0.5, ('de',), re.compile('(libert(är|&auml;r))', re.I)),
}


@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
"""
Per-site path filter. Return whether the path shall be retrieved.
"""
if not robots.can_fetch_url(site.base_url + path):
return False
if 'amusewiki' in site.meta_info.get('generator', '').lower():
if any(
[
path.endswith(end)
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
]
):
return False
if '/bbselect?' in path:
return False
return True

63
doc/source/devel/devel.md Normal file

@@ -0,0 +1,63 @@
## Setup dev environment
1. You need Python 3.9 or later.
1. Install pipenv, e.g.: install pip3 with `apt install python3-pip`, then run `pip3 install --user pipenv`
1. Clone the repo and setup a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```
## Configure the instance
See [installation](installation.md).
## Run
```
python -m atextcrawler
```
## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```
## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```
## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```
## Release
There are no releases (currently).
## Useful commands
### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```
### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;
http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*
http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices
-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
```


@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)
### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
* [repo](https://github.com/adbar/trafilatura)
* [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)
#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)
### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)
### url handling
* [courlan](https://pypi.org/project/courlan/)
### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)
### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/)
### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously
### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
https://support.shareaholic.com/hc/en-us/articles/115003085186
### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language
ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview

77
doc/source/devel/todo.md Normal file

@@ -0,0 +1,77 @@
## TODO
* parse html time tags
* site annotations:
* categories
* historical (no changes any more since n months)
* news
* local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip
* allow for tls in elasticsearch config
* replace dashes, dots and quotes: https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
'&#8211;': '--',
'&ndash;': '--',
'–': '--',
'&#8212;': '---',
'&mdash;': '---',
'—': '---',
'&#8230;': '...',
'&hellip;': '...',
'…': '...',
'&#8220;': '"',
'&#8221;': '"',
'&#8222;': '"',
'&#8243;': '"',
'&ldquo;': '"',
'&rdquo;': '"',
'&bdquo;': '"',
'&Prime;': '"',
'“':'"',
'”':'"',
'„':'"',
'″':'"',
'&#8216;':"'",
'&#8217;':"'",
'&#8242;':"'",
'&lsquo;':"'",
'&rsquo;':"'",
'&prime;':"'",
'':"'",
'':"'",
'':"'",
```
* normalize quotation marks and punctuation in general
* https://unicode-table.com/en/sets/quotation-marks/
* https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
* https://www.fileformat.info/info/unicode/category/Po/list.htm
* https://www.gaijin.at/en/infos/unicode-character-table-punctuation
* ⁝
* cancel crawls that take too long
* search for "TODO" in code
* feedparser has support for JSON feeds since commit
a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
(as of 2020-10-26 only in the "develop" branch, not part of a release);
the version names are 'json1' and 'json11'
* allow site URLs with path, e.g.
https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/
* add more languages
## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives
* [spacy-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)
* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)


@@ -0,0 +1,9 @@
Development
-----------
.. toctree::
:maxdepth: 2
devel/devel
devel/todo
devel/related_work

119
doc/source/elasticsearch.md Normal file

@@ -0,0 +1,119 @@
# Howto elasticsearch
## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```
If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```
## Setup
### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).
We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.
```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```
First test:
```
http -j GET 127.0.0.1:9200/
```
### Storage
```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```
Edit /etc/elasticsearch/elasticsearch.yml
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```
```
systemctl restart elasticsearch
```
The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```
### Setup passwords
Setup passwords:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```
Copy output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```
Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```
### Memory limitation
To limit memory usage
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF
systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```
and restart the service.
## Usage
Some useful requests:
### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```
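### Use the API key from Python
atextcrawler talks to elasticsearch with the async Python client, configured with the `id` and `api_key` from section `elasticsearch` in `main.yaml`. A minimal sketch of such a connection (placeholder host and credentials; the actual client setup in atextcrawler may differ slightly):
```
# Sketch: connect to elasticsearch 7.x with an API key (id + secret).
import asyncio

from elasticsearch import AsyncElasticsearch


async def main():
    es = AsyncElasticsearch(
        hosts=['http://127.0.0.1:9200'],
        api_key=('API_KEY_ID', 'API_KEY_SECRET'),  # placeholders
    )
    print(await es.info())  # cluster info, proves authentication works
    await es.close()


asyncio.run(main())
```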

37
doc/source/index.rst Normal file

@@ -0,0 +1,37 @@
atextcrawler
============
atextcrawler is an asynchronous webcrawler indexing text
for literal and semantic search.
Its client-side counterpart is atextsearch_.
atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.
atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.
.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch
.. toctree::
:maxdepth: 2
:caption: Contents:
introduction
installation
maintenance
development
reference/modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

122
doc/source/installation.md Normal file

@@ -0,0 +1,122 @@
# Installation
Installation was only tested on Debian bullseye (on amd64).
The instructions below are for this system.
(Please adapt to other environments.)
## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for python package gcld3 (see below).
## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```
## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).
Note: TLS is not yet supported, so install this service locally.
See [elasticsearch howto](elasticsearch.md).
## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.
Note: TLS is not yet supported, so install this service locally.
See [tensorflow howto](tensorflow_model_server.md).
## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:\$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages # for systemd
pre-commit install
```
Note: One of the dependencies, Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```
## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```
Edit `$HOME/.config/atextcrawler/main.yaml`.
If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/atextcrawler/plugins
```
Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.
Check (and print) the instance configuration:
```
python -m atextcrawler.config
```
## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.
## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target
[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure
[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```


@@ -0,0 +1,66 @@
# Introduction
## What atextcrawler does:
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
optimized for html5); discard non-text content, but handle feeds
and sitemaps
* Extract internal and external links; external links contribute
to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch
using tensorflow model server with a multilingual language model
## Architecture
Everything runs in a single Python process.
We use asyncio where possible (almost everywhere).
1. There is a queue of websites, see database table `site_queue`.
The queue is fed a) on first startup with seeds, b) manually
and c) from crawls which find external links.
When the queue is handled, new sites are stored to table `site`.
New sites are always updated; existing sites only if their last update was more than `crawl.site_revisit_interval` seconds in the past.
After the queue has been handled there is a delay
(`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
criteria are applied to its content to determine whether
the site is relevant. (It is assumed that (non-)relevance is
obvious from the start page already.) If the site is relevant,
more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
`crawl.workers`) which concurrently crawl sites, one at a time
per worker. (During the crawl the site is marked as locked using
crawl_active=true.) They pick a relevant site which has not been crawled for a certain time ("checkout"), crawl it, and finally mark it as crawled (crawl_active=false, "checkin") and schedule the next crawl; a simplified sketch of this checkout/checkin step follows after this list.
Each crawl (with begin time, end time, number of found (new)
resources) is stored in table `crawl`.
1. Crawls are either full crawls (in which all paths reachable
through links from the start page are fetched) or feed crawls (only paths listed in a feed of the site are fetched). The respective (minimum) intervals in which these crawls happen are `full_crawl_interval` and `feed_crawl_interval`.
Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
sitemap) or a TextResource (redirects are followed and irrelevant content is ignored). A TextResource obtained from a path can be very similar to a resource obtained from another path; in this case no new resource is created, but both paths are linked to the same resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
added to table `site_path`. If it is a feed, the feed is stored in table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.
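The checkout/checkin step can be pictured with this simplified sketch. It is not the real implementation (which lives in `atextcrawler.site` and also takes crawl scheduling into account); only `site.relevant` and `site.crawl_active` are actual columns, everything else is stripped down:
```
# Simplified sketch of checkout/checkin; the "due for a crawl"
# scheduling condition is omitted for brevity.

async def checkout_site_sketch(conn):
    # Atomically pick one relevant, unlocked site and lock it.
    sql = """
        UPDATE site SET crawl_active = true
        WHERE id = (
            SELECT id FROM site
            WHERE relevant AND NOT crawl_active
            LIMIT 1
            FOR UPDATE SKIP LOCKED
        )
        RETURNING id
    """
    return await conn.fetchval(sql)


async def checkin_site_sketch(conn, site_id):
    # Unlock the site again after the crawl has finished.
    sql = "UPDATE site SET crawl_active = false WHERE id=$1"
    await conn.execute(sql, site_id)
```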
## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are considered:
Blacklisting or whitelisting has precedence over function `site_filter`
(in plugin `filter_site`).
Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).
Each annotation requires a base_url of the annotated site and
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).
## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
background task on a server with limited resources
(or even an SBC, like raspberry pi, with attached storage)
* atextcrawler only indexes text, no other resources like images

23
doc/source/maintenance.md Normal file

@@ -0,0 +1,23 @@
# Maintenance
## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```
## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```


@@ -0,0 +1,98 @@
# Tensorflow model server
## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```
## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget -O universal-sentence-encoder-multilingual_3.tar.gz 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed'
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```
Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```
Config file `/srv/tensorflow/config`:
```
model_config_list: {
config: {
name: "sentences",
base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
model_platform: "tensorflow"
model_version_policy: {latest{}},
},
config: {
... (next model)
},
}
```
## Systemd integration
Edit /etc/systemd/system/tensorflow.service
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service
[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s
[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```
Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```
## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```
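atextcrawler calls the prediction endpoint configured as `tensorflow.model_server_endpoint` in `main.yaml`. A minimal sketch of such a request using `aiohttp` (the actual request handling in `atextcrawler.tensorflow` may differ slightly):
```
# Sketch: get embeddings for a few text chunks from the TF Serving
# REST "predict" endpoint (same endpoint as configured in main.yaml).
import asyncio

import aiohttp

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'


async def embed(texts):
    async with aiohttp.ClientSession() as session:
        async with session.post(ENDPOINT, json={'instances': texts}) as resp:
            data = await resp.json()
    return data['predictions']  # one embedding vector per input text


vectors = asyncio.run(embed(['Hello world', 'Hallo Welt']))
print(len(vectors), len(vectors[0]))
```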
## Docs
* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server
Datasets:
* https://www.tensorflow.org/datasets/catalog/overview

48
license.txt Normal file

@@ -0,0 +1,48 @@
ANTI-AUTHORITARIAN LICENSE version 1.0
________________________________________________________________________________
Obviously, this license is relevant to all who are bound by law.
The licensee ("you") must not be a commercial, military, clerical or
governmental entity. For this license the term "software" means the program
code, documentation as well as other data (for instance, language files).
Subject to the respective terms and conditions described below the licensee
is granted the non-exclusive and non-transferable license to:
A. make copies of the software
B. create derivative works ("modifications")
C. install and run copies or modifications of the software on any number of
servers, thereby making them usable for the licensee and possibly others
D. offer or give copies or modifications of the software, or parts of the
unmodified or modified software to others
For these permissions the respective conditions stated below must be met:
* For permission A condition 1 must be met.
* For permission B all of the conditions 1, 3, 4 must be met.
* For permission C all of the conditions 2, 3 must be met.
* For permission D all of the conditions 1, 2, 3, 4, 5 must be met.
These are the conditions:
1. You include this copyright notice and license in any copy or modification.
In files that contain a reference to it you preserve this reference.
2. You do not use this software or any modification of it for any commercial
purpose or for monetary gain, and also not for any military, governmental
or religious purpose; here with commercial purpose we mean activities which
have among their goals to make profit, be it monetary profit or any other
kind of profit that may entail or contribute to monetary profit.
3. Demos or screenshots of the modified or unmodified software must not be
published in any medium which requires the viewers to pay money in order
to see the contents; here money paid for mere internet connectivity (i.e.,
independent of the content supplier) is to be disregarded.
4. You do not impose any further restrictions on this software or any
derivative works beyond those restrictions herein.
5. The copy or modification must include source code, and must allow
distribution in source code as well as compiled form. The source code
must be the preferred form in which a programmer would modify the program.
Deliberately obfuscated source code is not allowed. Intermediate forms
such as the output of a preprocessor or translator are not allowed.
For this license itself, if re-used for other software, the following
copyright and license applies (copyheart license):
♡ Copying is an act of love. Please copy.

10
pyproject.toml Normal file

@@ -0,0 +1,10 @@
# TOML formatted file; see PEP 518
[tool.isort]
profile = "black"
#multi_line_output = 3
[tool.black]
line-length = 79
target_version = ['py39']
skip-string-normalization = true


@@ -0,0 +1,12 @@
"""
atextcrawler application execution entry point.
"""
import asyncio
from .application import Application
from .config import Config
if __name__ == '__main__':
config = Config().get()
asyncio.run(Application(config).run())


@@ -0,0 +1,204 @@
"""
atextcrawler application.
"""
import asyncio
import importlib
import logging
import signal
import sys
from systemd.journal import JournalHandler
from .config import Config
from .crawl import CrawlWorker
from .db import PGPool
from .search import shutdown_engine, startup_engine
from .site import load_seeds, process_site_queue
plugin_names = ['filter_site', 'filter_site_path', 'filter_resource_path']
class Application:
"""
atextcrawler application.
The basic structure of the application is this:
* one site crawler works just on the site_queue: fetching start pages
of sites and storing updated site information in table sites
* N other CrawlWorkers each do this in a loop:
checkout a site that is due for crawl and crawl its resources;
they fill the site_queue
"""
running = True
def __init__(self, config=None):
if config is None:
config = Config().get()
self.config = config
self.instance_name = config['instance_name']
self.instance_type = config['instance_type']
log_level = getattr(
logging, config['log_level'].upper(), logging.CRITICAL
)
self.logger = logging.getLogger('atextcrawler')
self.logger.setLevel(log_level)
if self.instance_type == 'dev':
self.logger.addHandler(logging.StreamHandler())
else:
self.logger.addHandler(
JournalHandler(SYSLOG_IDENTIFIER=self.instance_name)
)
self.logger.propagate = False
self.channel = 'atextcrawler_' + self.config['instance_name']
msg = f'Instance "{self}" initializing'
self.logger.info(msg)
self.plugins = self._load_plugins()
def __str__(self):
return self.instance_name
def _load_plugins(self):
"""
Return a dict mapping plugin names to modules.
"""
modules = {}
old_path = sys.path
for name in plugin_names:
try:
plugins_dir = self.config['plugins_dir']
sys.path.insert(0, plugins_dir)
module = importlib.import_module(name)
msg = f'Loading plugin "{name}" from {plugins_dir}'
except:
module = importlib.import_module(
'atextcrawler.plugin_defaults.' + name
)
msg = f'Loading plugin "{name}" from default location'
self.logger.info(msg)
modules[name] = module
sys.path = old_path
return modules
async def run(self):
"""
Application lifecycle.
"""
await asyncio.gather(self.wait_for_shutdown(), self.startup())
await self.shutdown()
async def startup(self):
"""
Asynchronous startup.
"""
msg = f'Instance "{self}" starting components'
self.logger.info(msg)
self.search_engine = await startup_engine(self.config)
self.pgpool = await PGPool(self.config['postgresql'])
self.pool = self.pgpool.pool
await load_seeds(self.config, self.pool)
await reset_site_locks(self.pool)
worker_count = self.config['crawl']['workers']
self.workers = []
for worker_number in range(worker_count):
worker = await CrawlWorker(self, worker_number, self.pool)
self.workers.append(worker)
worker_coros = [worker.run() for worker in self.workers]
await asyncio.gather(
process_site_queue(self, self.pool),
self.handle_notifications(),
*worker_coros,
)
async def wait_for_shutdown(self):
"""
Create a shutdown event (:class:`asyncio.Event`) and wait for it.
The event will be set by a signal handler for SIGINT
and SIGTERM signals (see :meth:`Application.handle_shutdown_signal`).
"""
self.shutdown_event = asyncio.Event()
for sig in (signal.SIGINT, signal.SIGTERM):
asyncio.get_running_loop().add_signal_handler(
sig, self.handle_shutdown_signal
)
self.logger.debug(f'{self} waiting for shutdown event')
await self.shutdown_event.wait()
self.logger.info(f'Instance "{self}" shutdown event')
def handle_shutdown_signal(self):
"""
Handle shutdown signal.
"""
if self.shutdown_event.is_set():
return
self.shutdown_event.set()
self.running = False
async def shutdown(self):
"""
Asynchronous shutdown.
"""
self.logger.debug(f'Instance "{self}" shutting down')
await self.notify_conn.remove_listener(
self.channel, self.listen_callback
)
await self.pool.release(self.notify_conn)
for worker in self.workers:
await worker.shutdown()
await shutdown_engine(self.search_engine)
await self.pgpool.shutdown()
self.logger.info(f'Instance "{self}" shutdown completed')
async def handle_notifications(self):
"""
Handle notifications using PostgreSQL's NOTIFY/LISTEN.
"""
self.notify_conn = await self.pool.acquire()
await self.notify_conn.add_listener(self.channel, self.listen_callback)
def listen_callback(self, *args):
"""
Handle notify event from PostgreSQL.
"""
channel = args[2]
if channel != self.channel:
return
message = args[3]
if message.startswith('site_update '):
try:
site_id = int(message.removeprefix('site_update '))
for worker in self.workers:
if worker.site and site_id == worker.site.id_:
msg = (
f'Cancelling worker {worker.worker_number}'
f' (site={site_id}) due to site_update'
)
self.logger.info(msg)
worker.running = False
except:
pass
async def sleep(self, duration, t_slice=3):
"""
Sleep for *duration* seconds while self.running.
Check self.running every *t_slice* seconds.
"""
remaining = duration
while remaining > 0 and self.running:
await asyncio.sleep(min(t_slice, remaining))
remaining -= t_slice
async def reset_site_locks(pool):
"""
Remove locks leftover from last run: Set crawl_active=false for all sites.
This is relevant when the application was not shutdown properly (e.g.
when the process was killed).
"""
async with pool.acquire() as conn:
sql = "UPDATE site SET crawl_active = false WHERE crawl_active = true"
await conn.execute(sql)


@@ -0,0 +1,7 @@
The recommended language tags to use in webpages are from
the IANA Language Subtag Registry (BCP47), see:
https://www.w3.org/International/questions/qa-html-language-declarations
https://r12a.github.io/app-subtags/
wget -O- https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | rg '^Subtag: |^Tag: ' atextcrawler/assets/iana_langs_ | sed -e 's/^Subtag: //' | sed -e 's/^Tag: //'


@@ -0,0 +1,219 @@
aa
ab
ae
af
ak
am
an
ar
as
av
ay
az
ba
be
bg
bh
bi
bm
bn
bo
br
bs
ca
ca
ce
ch
co
cr
cs
cu
cu
cu
cu
cu
cv
cy
da
de
dv
dv
dv
dz
ee
el
en
eo
es
es
et
eu
fa
ff
fi
fj
fo
fr
fy
ga
gd
gd
gl
gn
gu
gv
ha
he
hi
ho
hr
ht
ht
hu
hy
hz
ia
id
ie
ie
ig
ii
ii
ik
io
is
it
iu
ja
jv
ka
kg
ki
ki
kj
kj
kk
kl
kl
km
kn
ko
kr
ks
ku
kv
kw
ky
ky
la
lb
lb
lg
li
li
li
ln
lo
lt
lu
lv
mg
mh
mi
mk
ml
mn
mr
ms
mt
my
na
nb
nb
nd
nd
ne
ng
nl
nl
nn
nn
no
nr
nr
nv
nv
ny
ny
ny
oc
oj
om
or
os
os
pa
pa
pi
pl
ps
ps
pt
qu
rm
rn
ro
ro
ro
ru
rw
sa
sc
sd
se
sg
si
si
sk
sl
sm
sn
so
sq
sr
ss
st
su
sv
sw
ta
te
tg
th
ti
tk
tl
tn
to
tr
ts
tt
tw
ty
ug
ug
uk
ur
uz
ve
vi
vo
wa
wo
xh
yi
yo
za
za
zh
zu

File diff suppressed because it is too large.

337
src/atextcrawler/config.py Normal file

@@ -0,0 +1,337 @@
"""
Configuration loader and validator.
"""
import os
import re
import sys
from io import TextIOBase
from pathlib import Path
from typing import Any, Optional, Union
from voluptuous import All
from voluptuous import Any as VAny
from voluptuous import Invalid, Length, Range, Required, Schema, Url
from yaml import load
try:
from yaml import CLoader as Loader # type: ignore
except ImportError:
from yaml import Loader # type: ignore
class ConfigError(Exception):
"""
Application configuration error.
"""
def __init__(self, err):
self.msg = str(err)
def __str__(self):
return f'Application configuration error: {self.msg}'
class Config:
"""
Application configuration.
Access the full application configuration using :meth:`get`.
It is a dictionary with these keys:
* 'directory': the configuration directory being used
* 'main': the main configuration from main.yaml, but
postgresql configuration may be overriden by environment
variable ATEXTCRAWLER_POSTGRESQL
"""
config = None
@classmethod
def get(
cls,
out: Optional[TextIOBase] = None,
) -> Optional[dict]:
"""
Load and validate app configuration if not already done; return it.
On errors print them to *out* and if out is sys.stdout, then
also exit with exit code 2. Otherwise just return None.
"""
if cls.config:
return cls.config
if out is None:
out = sys.stdout # type: ignore
_config = _load_config()
msg = None
if isinstance(_config, ConfigError):
msg = f'ERROR: configuration could not be loaded: {_config}'
else:
config = _validate_config(_config)
if isinstance(config, ConfigError):
config_dir = _config.get('config_dir')
msg = (
f'ERROR: invalid configuration in {config_dir}:'
f' {config}'
)
if msg:
print(msg, file=out)
if out == sys.stdout:
sys.exit(2)
else:
return None
config['postgresql']['min_size'] = config['crawl']['workers'] + 2
config['postgresql']['max_size'] = config['crawl']['workers'] + 2
cls.config = config
return config
def _load_config() -> Union[ConfigError, dict]:
"""
Load configuration; search in multiple directories.
We search these locations; the first location containing main.yaml
will be used::
* a directory defined in environment variable ATEXTCRAWLER_CONF
* subdir .config/atextcrawler in the user's home (`$HOME`)
* /etc/atextcrawler
In the same directory where this main.conf is located a subdirectory
'plugins' must exist and contain the configurations of plugins.
On failure return the first error (a :class:`ConfigError`).
Otherwise return the main application configuration (the dict
loaded from main.yaml) with these adjustments:
* key `config_dir` is set to the configuration directory being used
* the `postgresql` section may be overridden by environment variable
ATEXTCRAWLER_POSTGRESQL
"""
Path(__file__).parent.parent
config_dirs = []
if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
config_dirs.append(Path(env_conf))
if env_home := os.environ.get('HOME'):
config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
config_dirs.append(Path('/etc/atextcrawler'))
for config_dir in config_dirs:
main_yaml_path = config_dir / 'main.yaml'
if main_yaml_path.exists():
break
else:
locs = ', '.join([str(loc) for loc in config_dirs if loc])
msg = (
f'Missing main.yaml in all config locations: {locs}\n'
f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
f' to define a custom config directory.'
)
return ConfigError(msg)
# load main.yaml
try:
with main_yaml_path.open() as main_yaml:
main_config = load(main_yaml.read(), Loader=Loader)
except Exception as err:
return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')
# main_config must be a dict
if not isinstance(main_config, dict):
return ConfigError(f'File {main_yaml_path} must contain a dictionary')
# postgresql config from environment has precedence
postgresql_config = _get_env_postgresql()
if isinstance(postgresql_config, ConfigError):
return postgresql_config
main_config['postgresql'] = postgresql_config or main_config['postgresql']
main_config['config_dir'] = str(config_dir)
return main_config
def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
"""
Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.
Return an error or the PostgreSQL config (which can be None if
the environment variable is not defined).
"""
env_var = 'ATEXTCRAWLER_POSTGRESQL'
value = os.environ.get(env_var, '').strip()
if not value:
return None
param_names = (
'host',
'port',
'database',
'user',
'password',
'schema_name',
)
re_dsn = re.compile(
'((' + '|'.join(param_names) + ')'
'=("(((?=[^"\\\\]).|\\\\.)*)"' # value in double quotes
'|\'(((?=[^\'\\\\]).|\\\\.)*)\'' # value in single quotes
'|([^"\' ]*)' # value unquoted
')( |$))+?'
)
params = {}
for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
params[varname] = (
v3
or (v1 or '').replace('\\"', '"')
or (v2 or '').replace("\\'", "'")
)
if 'host' not in params:
params['host'] = 'localhost'
if 'port' not in params:
params['port'] = '5432'
if 'schema_name' not in params:
params['schema_name'] = 'public'
for name in param_names:
if name not in params:
return ConfigError(
f'Missing {name} in environment variable {env_var}'
)
else:
params['port'] = int(params['port'])
return params
def _validate_config(config: Any) -> Union[ConfigError, dict]:
"""
Validate the given configuration and fill in default values.
If invalid, return only the first error.
Otherwise return the configuration with added default values.
"""
try:
return schema_main(config)
except Exception as err:
return ConfigError(err)
def plugins_dir(config):
"""
Validate plugins directory (absolute or relative path).
If it is a relative path, prepend the config_dir.
"""
config_dir = config['config_dir']
plugins_dir = config['plugins_dir']
if plugins_dir.startswith('/'):
try:
plugins_dir = Path(plugins_dir)
except:
raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
else:
try:
plugins_dir = str(Path(config_dir) / Path(plugins_dir))
config['plugins_dir'] = plugins_dir
except:
raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
if not (Path(plugins_dir) / '__init__.py').exists():
raise Invalid(f'plugins_dir "{plugins_dir}" has no "__init__.py"')
return config
def postgresql_identifier(value):
"""
Validate a PostgreSQL identifier.
"""
if not isinstance(value, str) or not re.match(
'^[a-z][a-z0-9_]{0,30}$', value
):
raise Invalid(
f'Invalid PostgreSQL identifier "{value}", '
'pattern must be: [a-z][a-z0-9_]{0,30}'
)
return value
def positive_number(value):
"""
Validate a positive number (int or float).
"""
if (isinstance(value, int) or isinstance(value, float)) and value > 0:
return value
raise Invalid('Not a positive number')
schema_postgresql = Schema(
{
Required('host'): All(str, Length(min=1)),
Required('port', default=5432): All(int, Range(min=0, max=65535)),
Required('database'): All(str, Length(min=1)),
Required('user'): All(str, Length(min=1)),
Required('password'): str,
Required('schema_name', default='public'): postgresql_identifier,
}
)
schema_crawl = Schema(
{
Required('workers', default=10): All(int, Range(min=0, max=1000)),
Required('site_delay', default=600): positive_number,
Required('site_revisit_interval', default=3600): positive_number,
Required('resource_delay', default=5): positive_number,
Required('full_crawl_interval', default=864000): positive_number,
Required('feed_crawl_interval', default=86400): positive_number,
}
)
schema_elasticsearch = Schema(
{
Required('host'): All(str, Length(min=1)),
Required('api_key'): All(str, Length(min=1)),
Required('id'): All(str, Length(min=1)),
Required('index_base_name'): All(str, Length(min=1)),
}
)
schema_tensorflow = Schema(
{
Required('model_server_endpoint'): Url(),
}
)
schema_main = Schema(
All(
{
Required('config_dir'): All(str, Length(min=1)),
Required(
'instance_name', default='atextcrawler'
): postgresql_identifier,
Required('instance_type', default='prod'): VAny(
'dev',
'staging',
'prod',
),
Required('log_level', default='info'): VAny(
'critical',
'error',
'warning',
'info',
'debug',
),
Required('plugins_dir', default='plugins'): All(
str, Length(min=1)
),
Required('postgresql'): schema_postgresql,
Required('crawl'): schema_crawl,
Required('elasticsearch'): schema_elasticsearch,
Required('tensorflow'): schema_tensorflow,
},
plugins_dir,
)
)
if __name__ == '__main__':
from pprint import pprint
pprint(Config().get())

215
src/atextcrawler/crawl.py Normal file

@@ -0,0 +1,215 @@
"""
Crawl a site.
"""
import logging
from datetime import datetime
import aiohttp
from .models import Crawl
from .resource import ResourceFetcher, get_site_path, process_site_path
from .site import (
RobotsInfo,
checkin_site,
checkout_site,
fetch_feeds,
process_site,
update_site,
)
from .tensorflow import TensorFlow
logger = logging.getLogger(__name__)
class CrawlWorker:
"""
Worker fetching sites, crawling their resources and storing statistics.
"""
def __init__(self, app, worker_number, pool):
self.app = app
self.worker_number = worker_number
self.pool = pool
self.site_delay = self.app.config['crawl']['site_delay']
self.resource_delay = self.app.config['crawl']['resource_delay']
self.site = None
self.crawl = None
self.running = True # do crawl
def __await__(self):
return self.__ainit__().__await__()
async def __ainit__(self):
await self.startup()
return self
async def startup(self):
"""
Asynchronous startup.
"""
logger.info(f'Starting worker {self.worker_number}')
self.conn = await self.pool.acquire()
self.session = aiohttp.ClientSession()
self.fetcher = ResourceFetcher(self.session)
self.tf = TensorFlow(self.app, self.session)
async def shutdown(self):
"""
Asynchronous shutdown.
"""
logger.info(f'Shutting down worker {self.worker_number}')
await self.session.close()
await self.pool.release(self.conn)
async def run(self):
"""
Worker loop: fetch a site, crawl its resources and store statistics.
If no site needs to be crawled, sleep for self.site_delay seconds
(configured in crawl.site_delay).
"""
await self.app.sleep(2)
while self.app.running and self.running:
self.site, is_full, more = await checkout_site(self.app, self.conn)
if not self.site:
msg = f'Worker {self.worker_number}: sites exhausted'
logger.debug(msg)
if not more:
await self.app.sleep(self.site_delay)
continue
self.crawl = await get_or_create_crawl(
self.conn, self.site.id_, is_full
)
try:
if is_full:
site_upd, _ = await update_site(
self.app,
self.fetcher,
self.conn,
self.site.base_url,
site=self.site,
)
if site_upd and site_upd.crawl_enabled:
self.site = site_upd
await process_site(
self.fetcher,
self.conn,
self.site,
)
elif self.site.crawl_enabled:
await fetch_feeds(self.fetcher, self.conn, self.site)
if self.site.crawl_enabled:
await self.crawl_resources()
except:
msg = (
f'Worker {self.worker_number} failed crawl'
f' {self.crawl.id_} of site {self.site.id_}'
f' ({self.site.base_url})'
)
logger.exception(msg)
await self.crawl.finish(
self.conn, self.app.running and self.running
)
await checkin_site(self.app, self.conn, self.site, self.crawl)
msg = (
f'Worker {self.worker_number} finished crawl'
f' {self.crawl.id_}'
)
logger.debug(msg)
self.site = None
# if we were cancelled, but the app is still running, run again
if self.app.running:
self.running = True
msg = f'Closing crawler {self.worker_number}'
logger.debug(msg)
async def crawl_resources(self):
"""
Loop over resources of the site and process them. Collect statistics.
All workers operate on distinct sites, so no need for locking here.
"""
crawl_type = 'full' if self.crawl.is_full else 'feed'
msg = (
f'Worker {self.worker_number} beginning'
f' {crawl_type} crawl {self.crawl.id_}'
f' of site {self.site.id_} ({self.site.base_url})'
)
logger.info(msg)
resource_delay = self.resource_delay
robots = await RobotsInfo(self.site.base_url)
if robots.delay:
resource_delay = robots.delay
while self.app.running and self.running:
site_path = await get_site_path(
self.conn,
self.site,
self.crawl.t_begin,
only_new=not self.crawl.is_full,
)
if not site_path:
msg = (
f'Worker {self.worker_number} ending crawl'
f' {self.crawl.id_}: paths exhausted'
)
logger.info(msg)
return
try:
sp_filter = self.app.plugins['filter_site_path'].sp_filter
if sp_filter(self.site, site_path.path, robots):
is_new_resource = await process_site_path(
self.app,
self.worker_number,
self.conn,
self.fetcher,
self.tf,
self.site,
site_path,
)
if is_new_resource:
self.crawl.n_resources_new += 1
if is_new_resource is not None:
self.crawl.n_resources += 1
await self.app.sleep(resource_delay)
else:
sql = (
"UPDATE site_path SET"
" last_visit=now() at time zone 'UTC',"
" filtered=true"
" WHERE id=$1"
)
await self.conn.execute(sql, site_path.id_)
except:
msg = (
f'Worker {self.worker_number} processing path failed'
f' in crawl {self.crawl.id_}: {site_path}'
)
logger.exception(msg)
site_path.ok_count -= 1
await site_path.save(self.conn)
msg = (
f'Worker {self.worker_number}: stopped crawl' f' {self.crawl.id_}'
)
logger.info(msg)
async def get_or_create_crawl(conn, site_id, is_full=True) -> Crawl:
"""
Return a new or existing+unfinished crawl.
If an existing crawl is found, return it, disregarding whether
it is a full crawl or not.
"""
sql = "SELECT * FROM crawl WHERE site_id=$1 AND t_end is null LIMIT 1"
if row := await conn.fetchrow(sql, site_id):
return await Crawl().load_from_row(row)
else:
# create a new crawl
crawl = Crawl(
site_id=site_id,
is_full=is_full,
t_begin=datetime.utcnow(),
)
await crawl.save(conn)
return crawl

162
src/atextcrawler/db.py Normal file

@@ -0,0 +1,162 @@
"""
PostgreSQL connectivity.
PGPool can be used as context manager. It takes postgresql configuration
parameters and gives a connection pool.
"""
import logging
import sys
from io import TextIOBase
from pathlib import Path
from traceback import format_exc
from typing import Dict
import asyncpg
from .utils.json import json_dumps, json_loads
logger = logging.getLogger(__name__)
class PGPool:
"""
Database connectivity: Provide a connection pool.
Can be used either as an async context manager (yielding a pool),
or by awaiting the instance (async init) and calling the shutdown
method when done; the pool is then available as the pool attribute.
After startup self.pool contains a PostgreSQL connection pool
(instance of :class:`asyncpg.pool.Pool`).
Startup also runs schema migrations (cf. directory `migrations`).
"""
def __init__(
self,
postgresql_config: dict,
out: TextIOBase = None,
check: bool = True,
) -> None:
self.conf = postgresql_config
self.out = out or sys.stdout
self.check = check
self.pool = None
def __await__(self):
return self.__ainit__().__await__()
async def __ainit__(self):
await self.__aenter__()
return self
async def __aenter__(self):
"""
Return the connection pool after an optional check.
The check tests basic database access and runs missing migrations.
If the check fails, return None.
"""
pool_params = {
key: val
for key, val in self.conf.items()
if key
in (
'host',
'port',
'database',
'user',
'password',
'max_size',
'min_size',
)
}
pool_params['command_timeout'] = 30
self.pool = await asyncpg.create_pool(**pool_params, init=self._init)
if self.check:
async with self.pool.acquire() as conn:
if await self.check_or_migrate(conn):
return self.pool
@staticmethod
async def _init(conn) -> None:
"""
Add JSON encoding and decoding to the given connection.
"""
await conn.set_type_codec(
'jsonb',
encoder=json_dumps,
decoder=json_loads,
schema='pg_catalog',
)
async def __aexit__(self, exc_type, exc, tb) -> None:
"""
Close the connection pool.
"""
await self.shutdown()
async def shutdown(self):
"""
Close the pool.
"""
await self.pool.close()
async def check_or_migrate(self, conn: asyncpg.Connection) -> bool:
"""
Check database connectivity and run missing schema migrations.
Return whether both succeeded.
"""
row = await conn.fetchrow('SELECT 1+1 AS result')
if not row or row.get('result') != 2:
msg = 'Database SELECT 1+1 not working; missing privileges?'
print(msg, file=self.out)
logger.critical(msg)
return False
# determine current schema_version
try:
sql = "SELECT value::int FROM kvs WHERE key='schema_version'"
schema_version = await conn.fetchval(sql)
except:
schema_version = 0
# run missing migrations
migrations = get_migrations()
for number, text in sorted(migrations.items()):
if number > schema_version:
cmds = text.split('\n----\n')
for cmd in cmds:
if not cmd.strip():
continue
try:
await conn.execute(cmd)
except:
msg = (
f'Exception during migration {number} in '
f'statement\n{cmd}'
)
print(msg, file=self.out)
logger.critical(msg)
print(format_exc(), file=self.out)
logger.critical(format_exc())
return False
# return success
return True
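# Usage sketch (illustrative only; the configuration values are assumptions,
# not part of this module):
#
#     async def example(conf):
#         # conf is a dict with host, port, database, user, password, ...
#         async with PGPool(conf) as pool:
#             async with pool.acquire() as conn:
#                 print(await conn.fetchval('SELECT 1'))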
def get_migrations() -> Dict[int, str]:
"""
Return migrations (number and text content of migration file).
"""
migrations_dir = Path(__file__).parent / 'migrations'
migrations = {}
for migration_file in migrations_dir.glob('*.sql'):
migration_number = int(migration_file.name[:-4])
with migration_file.open() as mig_file:
content = mig_file.read()
migrations[migration_number] = content
return migrations

View file

@@ -0,0 +1,297 @@
CREATE TABLE kvs (
id bigserial PRIMARY KEY,
t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
key varchar(200) NOT NULL UNIQUE,
value jsonb
)
----
COMMENT ON COLUMN kvs.t_update IS 'Time of last update or insert of the entry';
----
COMMENT ON COLUMN kvs.key IS 'Key';
----
COMMENT ON COLUMN kvs.value IS 'Value';
----
COMMENT ON TABLE kvs IS 'Simple key-value store';
----
INSERT INTO kvs (key, value) VALUES ('schema_version', '1');
----
CREATE TABLE site (
id bigserial PRIMARY KEY,
canonical_url varchar(200),
base_url varchar(200) NOT NULL,
base_urls varchar(200)[] NOT NULL,
domains varchar(100)[],
ips inet[] NULL,
crawl_enabled bool NOT NULL DEFAULT false,
crawl_active bool NOT NULL DEFAULT false,
next_full_crawl timestamp,
next_feed_crawl timestamp,
last_update timestamp,
last_pub timestamp,
pub_dates jsonb NOT NULL DEFAULT '{}'::jsonb,
langs char(2)[] NOT NULL DEFAULT ARRAY[]::varchar(2)[],
alt_langs jsonb NOT NULL DEFAULT '{}'::jsonb,
title varchar(200),
description varchar(2000),
keywords varchar(50)[] NOT NULL DEFAULT ARRAY[]::varchar(50)[],
linkbacks jsonb NOT NULL DEFAULT '{}'::jsonb,
meta_info jsonb NOT NULL DEFAULT '{}'::jsonb,
boilerplate_texts jsonb NOT NULL DEFAULT '[]'::jsonb
)
----
CREATE INDEX site__base_url ON site (base_url)
----
CREATE INDEX site__base_urls ON site (base_urls)
----
CREATE INDEX site__domains ON site (domains)
----
CREATE INDEX site__ips ON site (ips)
----
CREATE INDEX site__next_full_crawl ON site (next_full_crawl)
----
CREATE INDEX site__next_feed_crawl ON site (next_feed_crawl)
----
CREATE INDEX site__langs ON site (langs)
----
CREATE INDEX site__title ON site (title)
----
CREATE INDEX site__description ON site (description)
----
CREATE INDEX site__keywords ON site (keywords)
----
COMMENT ON COLUMN site.base_url IS 'Preferred base URL (chosen from column base_urls)'
----
COMMENT ON COLUMN site.base_urls IS 'Base URLs that have been found to return the same content'
----
COMMENT ON COLUMN site.domains IS 'Domains that have been found to return the same content'
----
COMMENT ON COLUMN site.ips IS 'IPv4 or IPv6 addresses of the hostnames in base_urls'
----
COMMENT ON COLUMN site.crawl_enabled IS 'Whether the site should be indexed'
----
COMMENT ON COLUMN site.crawl_active IS 'Whether the crawl is in progress'
----
COMMENT ON COLUMN site.next_full_crawl IS 'Crawl all resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.next_feed_crawl IS 'Crawl the feed resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.last_update IS 'Time of last update of this site (in this database)'
----
COMMENT ON COLUMN site.last_pub IS 'Estimated time of last content publication on the site'
----
COMMENT ON COLUMN site.pub_dates IS 'Change history: map visit date to estimated publication date'
----
COMMENT ON COLUMN site.langs IS 'Languages of the site (ISO 639-1 codes)'
----
COMMENT ON COLUMN site.alt_langs IS 'Map links to alternative language versions of the site to ISO 639-1 language codes'
----
COMMENT ON COLUMN site.title IS 'Title as obtained from title tag or meta tags'
----
COMMENT ON COLUMN site.description IS 'Description as obtained from meta tags'
----
COMMENT ON COLUMN site.keywords IS 'Keywords as obtained from meta tags'
----
COMMENT ON COLUMN site.linkbacks IS 'Map URL to type of linkback (cf. https://en.wikipedia.org/wiki/Linkback)'
----
COMMENT ON COLUMN site.meta_info IS 'Values from meta tags and other meta information'
----
COMMENT ON COLUMN site.boilerplate_texts IS 'Boilerplate texts on the startpage and other sample pages'
----
COMMENT ON TABLE site IS 'Website'
----
CREATE TABLE site_queue (
id bigserial PRIMARY KEY,
src bigint NULL REFERENCES site(id) ON DELETE CASCADE,
url varchar(200) NOT NULL,
link_text varchar(100),
t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_queue__url ON site_queue (url)
----
COMMENT ON COLUMN site_queue.src IS 'The id of the linking site; null in case of seeds or manual additions'
----
COMMENT ON COLUMN site_queue.url IS 'Base URL of site to be assessed, ending with a slash or a mandatory base path'
----
COMMENT ON COLUMN site_queue.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON COLUMN site_queue.t_create IS 'Creation time of this entry'
----
COMMENT ON TABLE site_queue IS 'Queued site URLs'
----
CREATE TABLE site_feed (
id bigserial PRIMARY KEY,
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
url varchar(200) NOT NULL,
etag text,
modified varchar(50),
t_visit timestamp,
t_content timestamp,
version varchar(10),
title varchar(200),
description text,
fail_count smallint NOT NULL DEFAULT 0
)
----
CREATE INDEX site_feed__site ON site_feed (site_id)
----
CREATE INDEX site_feed__t_content ON site_feed (t_content)
----
COMMENT ON COLUMN site_feed.site_id IS 'Id of the site on which this feed was found'
----
COMMENT ON COLUMN site_feed.url IS 'URL of the feed'
----
COMMENT ON COLUMN site_feed.etag IS 'Etag obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.modified IS 'Last-Modified HTTP header value obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.t_visit IS 'Time of last retrieval of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.t_content IS 'Time of last content update; null before first retrieval'
----
COMMENT ON COLUMN site_feed.version IS 'Version of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.title IS 'Title of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.description IS 'Description of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.fail_count IS 'Number of failed retrievals after last successful retrieval; zero before first retrieval'
----
CREATE TABLE site_link (
id bigserial PRIMARY KEY,
src bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
dst bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
link_text varchar(100)
)
----
ALTER TABLE site_link ADD CONSTRAINT site_link_edge UNIQUE (src, dst)
----
CREATE INDEX site_link__src ON site_link (src)
----
CREATE INDEX site_link__dst ON site_link (dst)
----
COMMENT ON COLUMN site_link.src IS 'Source site'
----
COMMENT ON COLUMN site_link.dst IS 'Destination site'
----
COMMENT ON COLUMN site_link.t_create IS 'Time of creation of this entry'
----
COMMENT ON COLUMN site_link.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON TABLE site_link IS 'Cross-site link'
----
CREATE TABLE resource (
id bigserial PRIMARY KEY,
simhash bigint,
content_type varchar(50),
last_change timestamp,
text_len int,
lang char(2),
title varchar(200),
summary varchar(2000)
)
----
COMMENT ON COLUMN resource.simhash IS 'Simhash of the text content of the resource'
----
COMMENT ON COLUMN resource.content_type IS 'Content type extracted from Content-Type HTTP header'
----
COMMENT ON COLUMN resource.last_change IS 'Estimated time of the last update of this resource'
----
COMMENT ON COLUMN resource.text_len IS 'Length of the extracted text in characters'
----
COMMENT ON COLUMN resource.lang IS 'Language ISO 639-1 code'
----
COMMENT ON COLUMN resource.title IS 'Title of the resource (used for feed resources)'
----
COMMENT ON COLUMN resource.summary IS 'Content summary of the resource (used for feed resources)'
----
COMMENT ON TABLE resource IS 'Text resource (may be reachable by more than one path of a site)'
----
CREATE TABLE site_path (
id bigserial PRIMARY KEY,
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
path varchar(400) NOT NULL,
last_visit timestamp,
filtered bool NOT NULL DEFAULT false,
ok_count smallint NOT NULL DEFAULT 0,
canonical bool,
resource_id bigint REFERENCES resource(id) ON DELETE CASCADE
)
----
ALTER TABLE site_path ADD CONSTRAINT site_path__unique UNIQUE (site_id, path)
----
CREATE INDEX site_path__site_path ON site_path (site_id, path)
----
CREATE INDEX site_path__resource ON site_path (resource_id)
----
COMMENT ON COLUMN site_path.site_id IS 'Site id'
----
COMMENT ON COLUMN site_path.path IS 'Path'
----
COMMENT ON COLUMN site_path.last_visit IS 'Time of last retrieval of the resource; null before first retrieval'
----
COMMENT ON COLUMN site_path.ok_count IS 'Increased by 1 for every successful retrieval of the resource and decreased by 1 for every failed one'
----
COMMENT ON COLUMN site_path.canonical IS 'Whether the path is the canonical one for the resource; null before first retrieval'
----
COMMENT ON COLUMN site_path.resource_id IS 'Resource id; null before first retrieval'
----
COMMENT ON TABLE site_path IS 'Paths of a site pointing to text resources'
----
CREATE TABLE crawl (
id bigserial PRIMARY KEY,
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
is_full bool NOT NULL DEFAULT false,
t_begin timestamp,
t_end timestamp,
n_resources int NOT NULL DEFAULT 0,
n_resources_new int NOT NULL DEFAULT 0
)
----
CREATE INDEX crawl__site ON crawl (site_id)
----
CREATE INDEX crawl__t_begin ON crawl (t_begin)
----
COMMENT ON COLUMN crawl.site_id IS 'Site that is being crawled'
----
COMMENT ON COLUMN crawl.is_full IS 'Whether the crawl is a full crawl; if not it is a feed crawl'
----
COMMENT ON COLUMN crawl.t_begin IS 'Begin time of the crawl'
----
COMMENT ON COLUMN crawl.t_end IS 'End time of the crawl; if t_end is null resuming a crawl will fetch all resources with last_visit before t_begin'
----
COMMENT ON COLUMN crawl.n_resources IS 'Number of resources that were fetched during the crawl'
----
COMMENT ON COLUMN crawl.n_resources_new IS 'Number of new resources found during the crawl'
----
COMMENT ON TABLE crawl IS 'Crawl of resources on a site'
----
CREATE TYPE site_annotation_type AS ENUM ('whitelist', 'blacklist', 'suggestion', 'review', 'audience', 'location', 'themes', 'timescale')
----
COMMENT ON TYPE site_annotation_type IS 'Type of site annotation'
----
CREATE TABLE site_annotation (
id bigserial PRIMARY KEY,
site_id bigint REFERENCES site(id) ON DELETE SET NULL,
base_url varchar(200) NOT NULL,
ann_type site_annotation_type NOT NULL,
ann_content JSONB,
t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_annotation__site ON site_annotation (site_id)
----
CREATE INDEX site_annotation__base_url ON site_annotation (base_url)
----
COMMENT ON COLUMN site_annotation.site_id IS 'Site that is being annotated'
----
COMMENT ON COLUMN site_annotation.base_url IS 'Base URL of the site being annotated'
----
COMMENT ON COLUMN site_annotation.ann_type IS 'Annotation type'
----
COMMENT ON COLUMN site_annotation.ann_content IS 'Annotation content'
----
COMMENT ON COLUMN site_annotation.t_update IS 'Time of last update'
----
COMMENT ON TABLE site_annotation IS 'Manual annotations on a site'
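-- Illustrative example (SQL comment only, ignored when the migration runs;
-- the site id 1 is an assumption): a query like the following lists the most
-- recently visited paths of a site together with the language and title of
-- the resources they point to.
--   SELECT sp.path, r.lang, r.title
--   FROM site_path sp JOIN resource r ON r.id = sp.resource_id
--   WHERE sp.site_id = 1
--   ORDER BY sp.last_visit DESC LIMIT 10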

610
src/atextcrawler/models.py Normal file
View file

@@ -0,0 +1,610 @@
"""
Data Models.
"""
import logging
from dataclasses import InitVar, asdict, dataclass, field, fields
from datetime import date, datetime
from itertools import chain
from typing import Any, ClassVar, Optional
import tldextract
from asyncpg import Connection
from .search import delete_resource
from .utils.durl import Durl, get_url_variants
from .utils.link import extract_domain
from .utils.similarity import get_simhash, simhash_to_bigint
logger = logging.getLogger(__name__)
class ModelBase:
"""
Abstract base class for models.
Execute SQL to load, save, delete instances using asyncpg.
"""
table: ClassVar
id_: Optional[int] = 0
async def load(self, conn: Connection, id_: int) -> Optional[Any]:
"""
If loading fails, return None.
"""
sql = f"SELECT * FROM {self.table} WHERE id=$1"
row = await conn.fetchrow(sql, id_)
if not row:
return None
return await self.load_from_row(row)
async def load_from_row(self, row):
"""
If row is None, return None.
"""
if not row:
return None
data = dict(row)
self.id_ = data.pop('id')
self.__init__(**data)
return self
async def save(self, conn: Connection) -> None:
"""
Save the instance (update if self.id_ is set, else insert).
"""
data = asdict(self)
# logger.debug(f'Save {self}: id_={self.id_}')
if self.id_: # update
cols = ', '.join(data.keys())
upds = ', '.join(
[f'{col}=${i + 1}' for i, col in enumerate(data.keys())]
)
val_id = f'${len(data) + 1}'
sql = f"UPDATE {self.table} SET {upds} WHERE id={val_id}"
await conn.execute(sql, *data.values(), self.id_)
else: # insert
cols = ', '.join(data.keys())
vals = ', '.join([f'${i + 1}' for i in range(len(data))])
sql = (
f"INSERT INTO {self.table} ({cols}) VALUES ({vals})"
f" RETURNING id"
)
self.id_ = await conn.fetchval(sql, *data.values())
def asdict(self):
"""
Return instance data as dictionary.
"""
return asdict(self)
async def delete(self, conn: Connection) -> None:
"""
Delete the object if it has an id_.
"""
if self.id_:
sql = f"DELETE FROM {self.table} WHERE id=$1"
await conn.execute(sql, self.id_)
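# Sketch of how a concrete model builds on ModelBase (illustrative only;
# the table name and field are hypothetical, the real models follow below):
#
#     @dataclass
#     class Example(ModelBase):
#         table: ClassVar = 'example'
#         name: Optional[str] = None
#
#     # ex = Example(name='foo'); await ex.save(conn); await ex.delete(conn)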
class ResourceError:
"""
Error encountered while trying to fetch a resource.
ResourceError is used for cases when fetching a resource fails.
"""
def __init__(self, msg, status=None, headers=None):
self.msg = msg
self.status = status
self.headers = headers
def __repr__(self):
return f'ResourceError: {self.msg}'
class ResourceRedirect:
"""
A resource containing a redirect.
"""
def __init__(self, urls):
self.urls = urls
@dataclass
class TextResource(ModelBase):
"""
TextResource (without path).
TextResource models web resources with relevant text content.
They are instantiated in modules page, document, ...; their metadata
are stored in table `resource` and the text content is stored with the
search engine.
Do not confuse with SitePath: Several SitePath instances
may point to a TextResource. The TextResource holds the actual content.
If we are not dealing with the startpage of a new site,
the init_fields dict usually will contain the site to which
the resource belongs.
"""
table: ClassVar = 'resource'
init_fields: InitVar[dict] = None # additional fields after fetching
search_fields: InitVar[dict] = None # additional fields for indexing
# database fields
simhash: Optional[int] = None
content_type: Optional[str] = None
last_change: Optional[datetime] = None
text_len: int = 0
lang: Optional[str] = None
title: Optional[str] = None
summary: Optional[str] = None
def __post_init__(self, init_fields, search_fields):
if init_fields is None:
init_fields = {}
self.init_fields = init_fields
if search_fields is None:
search_fields = {}
self.search_fields = search_fields
self.site = self.init_fields.get('site')
self.site_id = self.site.id_ if self.site else None
self._update_simhash()
def __str__(self):
return (
f'TextResource(id={self.id_},'
f' site_id={self.site_id},'
f' type={self.content_type})'
)
def _update_simhash(self):
"""
Update the simhash of the resource from its text content.
"""
if self.simhash is None:
text = self.search_fields.get('text', '')
self.simhash = simhash_to_bigint(get_simhash(text))
async def save(self, conn: Connection):
"""
Save the instance, extending the parent's method.
"""
self.content_type = (
self.content_type[:50] if self.content_type else None
)
self.title = self.title[:200] if self.title else None
self.summary = self.summary[:400] if self.summary else None
self._update_simhash()
if self.last_change is None:
self.last_change = datetime.utcnow()
await super().save(conn)
async def update_from_resource(self, upd: 'TextResource'):
"""
Update self with values from another resource.
"""
names = [field.name for field in fields(self)]
for name in names:
cur_val = getattr(self, name)
upd_val = getattr(upd, name)
if not cur_val and upd_val is not None:
setattr(self, name, upd_val)
init_names = [
'headers',
'redirects',
'links_int',
'links_ext',
'shortlinks',
'canonical',
#'head',
]
self.init_fields = upd.init_fields
self.search_fields = upd.search_fields
# for init_name in init_names:
# cur_val = self.init_fields.get(init_name)
# upd_val = upd.init_fields.get(init_name)
# if not cur_val and upd_val is not None:
# self.init_fields[init_name] = upd_val
@dataclass
class MetaResource(ModelBase):
"""
Parent class for Feed, Sitemap, SitemapIndex.
MetaResource is a parent class for Feed, Sitemap, SitemapIndex.
Their instances are not stored, except for Feed, which holds feed
metadata and is stored in the database (table site_feed).
"""
@dataclass
class SitemapIndex(MetaResource):
"""
A SitemapIndex meta resource.
Just a list of the sitemap URLs, nothing more.
"""
sitemaps: list = field(default_factory=list)
@dataclass
class Sitemap(MetaResource):
"""
A Sitemap meta resource.
Just a list of the resulting links, nothing more.
"""
urls: list = field(default_factory=list)
@dataclass
class Feed(MetaResource):
"""
A site's feed (RSS, Atom, ...).
"""
table: ClassVar = 'site_feed'
entries: InitVar[list] = None
site_id: Optional[int] = None
url: Optional[str] = None
etag: Optional[str] = None
modified: Optional[str] = None
t_visit: Optional[datetime] = None
t_content: Optional[datetime] = None
version: Optional[str] = None
title: Optional[str] = None
description: Optional[str] = None
fail_count: int = 0
def __post_init__(self, entries):
self.entries = entries
def __str__(self):
return f'Feed(id={self.id_}, site_id={self.site_id}, url={self.url})'
async def save(self, conn: Connection):
"""
Save, trying to merge with existing entry matching on site_id and url.
"""
if not self.site_id or not self.url:
msg = 'Saving feed failed: missing site_id or url'
logger.error(msg)
return
sql = "SELECT id FROM site_feed WHERE site_id=$1 AND url=$2"
self.id_ = await conn.fetchval(sql, self.site_id, self.url)
await super().save(conn)
def debug(self) -> str:
"""
Return the instance data as a string for debug print output.
"""
return (
f'Feed:\n'
f'- id: {self.id_}\n'
f'- site_id: {self.site_id}\n'
f'- url: {self.url}\n'
f'- etag: {self.etag}\n'
f'- modified: {self.modified}\n'
f'- t_visit: {self.t_visit}\n'
f'- t_content: {self.t_content}\n'
f'- version: {self.version}\n'
f'- title: {self.title}\n'
f'- description: {self.description}\n'
f'- fail_count: {self.fail_count}\n'
f'- entries: {self.entries}'
)
@dataclass
class Site(ModelBase):
"""
Website.
"""
table: ClassVar = 'site'
base_durl: InitVar[Durl] = None
feeds: InitVar[dict] = None
links_ext: InitVar[dict] = None
links_int: InitVar[dict] = None
startpage_text: InitVar[str] = None
canonical_url: Optional[str] = None
base_url: Optional[str] = None
base_urls: list[str] = field(default_factory=list)
domains: list[str] = field(default_factory=list)
ips: Optional[list[str]] = None
crawl_enabled: bool = False
crawl_active: bool = False
next_full_crawl: Optional[datetime] = None
next_feed_crawl: Optional[datetime] = None
last_update: Optional[datetime] = None
last_pub: Optional[datetime] = None
pub_dates: Optional[dict[str, str]] = None
langs: list[str] = field(default_factory=list)
alt_langs: dict[str, str] = field(default_factory=dict)
title: Optional[str] = None
description: Optional[str] = None
keywords: list[str] = field(default_factory=list)
linkbacks: dict[str, str] = field(default_factory=dict)
meta_info: dict = field(default_factory=dict)
boilerplate_texts: list[str] = field(default_factory=list)
def __post_init__(
self,
base_durl: Durl,
feeds=None,
links_ext=None,
links_int=None,
startpage_text=None,
):
self.feeds = feeds
self.links_ext = links_ext
self.links_int = links_int
self.startpage_text = startpage_text
self.keywords = self.keywords[:20]
if not self.last_update:
self.last_update = datetime.utcnow()
pub_date: Optional[str]
if self.last_pub:
pub_date = date.isoformat(self.last_pub.date())
self.pub_dates = {date.isoformat(self.last_update): pub_date}
else:
pub_date = None
self.pub_dates = {}
if base_durl:
self.base_urls = [base_durl.url()[:200]]
self.domains = [extract_domain(base_durl.hostname)[:100]]
def __str__(self):
return (
f'Site(id={self.id_}, url={self.base_url},'
f' crawl_enabled={self.crawl_enabled})'
)
async def update_base_url(self) -> None:
"""
Update the base_url, choosing the most relevant URL.
If canonical_url is not None, use this.
Otherwise set self.base_url to the shortest of self.base_urls,
preferring an https URL if there is at least one.
"""
if self.canonical_url and self.canonical_url not in self.base_urls:
if canonical_durl := await Durl(self.canonical_url):
self.base_urls.append(self.canonical_url)
domain = extract_domain(canonical_durl.hostname)
if domain not in self.domains:
self.domains.append(domain)
if self.canonical_url:
self.base_url = self.canonical_url
return
if not self.base_url:
url_candidates = self.base_urls
if https_urls := [
url for url in self.base_urls if url.startswith('https://')
]:
url_candidates = https_urls
self.base_url = min(url_candidates, key=len)
async def save( # type: ignore
self, conn, merge=True
) -> tuple[Optional[int], bool]:
"""
Store the site, optionally trying to merge it with an existing site.
Return the id of the saved instance and whether a new instance
was created.
If self.id_ is not 0, replace the data of the existing site with
this id. Else if not merge, store as new row, and if merge,
try to merge with an existing matching site.
"""
await self.update_base_url()
if not merge:
created = not bool(self.id_)
await super().save(conn)
return self.id_, created
if self.id_:
sql = "SELECT base_urls, pub_dates FROM site WHERE id=$1"
row = await conn.fetchrow(sql, self.id_)
self.base_urls = list(
set(row['base_urls']).union(set(self.base_urls))
)
if previous_pub_dates := row['pub_dates']:
if not self.pub_dates:
self.pub_dates = {}
self.pub_dates.update(previous_pub_dates)
await super().save(conn)
return self.id_, False
same_site_id = await search_same_site(self, conn)
if same_site_id:
same_site = await Site().load(conn, same_site_id)
if same_site_id and same_site:
same_site.base_urls = set(same_site.base_urls).union(
set(self.base_urls)
)
same_site.domains = set(same_site.domains).union(set(self.domains))
if self.canonical_url and not same_site.canonical_url:
same_site.canonical_url = self.canonical_url
await same_site.save(conn, merge=False)  # store the merged site without merging again
self.id_ = same_site.id_
return self.id_, False
else:
await super().save(conn)
return self.id_, True
@dataclass
class SitePath(ModelBase):
"""
Path of a website. May point to a Resource.
"""
table: ClassVar = 'site_path'
site: InitVar[str] = None
site_id: Optional[int] = None
path: Optional[str] = None
filtered: bool = False
last_visit: Optional[datetime] = None
ok_count: int = 0
canonical: Optional[bool] = None
resource_id: Optional[int] = None
def __str__(self):
return (
f'SitePath(id={self.id_}, site_id={self.site_id},'
f' path={self.path})'
)
async def save(self, conn: Connection):
"""
Save the instance, extending the parent's method.
"""
self.path = self.path[:400] if self.path else ''
await super().save(conn)
async def unlink_resource(self, conn, engine, index_base_name):
"""
Unlink the resource and also delete it, if it has no more links.
"""
if self.id_:
if self.resource_id:
sql = "SELECT COUNT(*) FROM site_path WHERE resource_id=$1"
ref_count = await conn.fetchval(sql, self.resource_id)
if ref_count == 0:
sql = (
"DELETE FROM resource WHERE id=$1"
" RETURNING (true, lang)"
)
found = await conn.fetchval(sql, self.resource_id)
if found:
await delete_resource(
engine, found[1], self.resource_id
)
self.resource_id = None
def url(self, site):
"""
Return the full URL (combine the site's base_url with our path).
"""
return site.base_url + self.path
@dataclass
class Crawl(ModelBase):
"""
The crawl process of a website (begin, end, statistics, ...).
"""
table: ClassVar = 'crawl'
site_id: Optional[int] = None
is_full: bool = False
t_begin: datetime = field(default_factory=datetime.utcnow)
t_end: Optional[datetime] = None
n_resources: int = 0
n_resources_new: int = 0
async def finish(self, conn, set_t_end):
"""
Save the crawl. Set t_end only if indicated.
"""
if set_t_end:
self.t_end = datetime.utcnow()
await self.save(conn)
async def search_same_site(
site: Site,
conn: Connection,
) -> Optional[int]:
"""
Try to find a matching site for the given *site* and return its id.
TODO: if the path is non-trivial, require it also for the matching site
Two sites match when they return the same content for identical paths.
The base_url (scheme and/or netloc) may differ.
We do not have the content for all paths of both websites, so we need
to estimate: We only take into account meta information from the
start pages of both sites, in particular the title, description
and information obtained from the base_urls:
We use a combination of these conditions:
1. one of the sites has a canonical URL which matches the
URL of the other site
2. the content fields (title, description) have sufficient information
3. the content fields match exactly
4. the domain matches
5. the domain matches, except for the TLD
6. the base_urls differ in their schemes (http vs. https)
7. the hostnames in the base_urls are identical
8. the hostnames in the base_urls differ by a prepended 'www.'
9. the IPs have at least one common address
The algorithm is this (first answer is final, yes means match):
* if (1) : yes
* if (2), (3), (4) : yes
* if (2), (3), (5), (9) : yes
* if (6), ((7) or (8)) : yes
* no
"""
# rule (1)
if site.canonical_url:
sql = "SELECT id FROM site WHERE $1=ANY(base_urls) LIMIT 1"
id_ = await conn.fetchval(sql, site.canonical_url)
if id_:
return id_
else:
sql = "SELECT id FROM site WHERE canonical_url=ANY($1) LIMIT 1"
id_ = await conn.fetchval(sql, site.base_urls)
if id_:
return id_
# rule (6), ((7) or (8))
url_variants = set(
chain.from_iterable(
get_url_variants(base_url) for base_url in site.base_urls
)
)
sql = f"SELECT id FROM site WHERE base_urls && $1 LIMIT 1"
if id_ := await conn.fetchval(sql, url_variants):
return id_
# condition (2)
if len(site.title or '') > 15 or len(site.description or '') > 15:
sql = (
f"SELECT * FROM site WHERE"
f" COALESCE(title, '')=$1 AND COALESCE(description, '')=$2"
)
rows = await conn.fetch(sql, site.title or '', site.description or '')
# condition (3)
if rows:
# condition (4)
for row in rows:
domains = set(row.get('domains', []))
if domains & set(site.domains):
return row['id']
# condition (9)
for row in rows:
ips = set(row.get('ips', []))
if site.ips and ips & set(site.ips):
# condition (5)
domains_ = row.get('domains', [])
d1 = set([tldextract.extract(d).domain for d in domains_])
domains_ = site.domains or []
d2 = set([tldextract.extract(d).domain for d in domains_])
if d1 & d2:
return row['id']
return None

View file

@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.
This plugin implements :func:`rp_filter`.
"""
from typing import Optional
def rp_filter(site, durl) -> Optional[str]:
"""
Adjust or filter found paths (may depend on site).
To filter out a path (i.e., not add it to table `site_path`)
return None.
"""
path = durl.pwa()
# skip fetching images (linked from a tags; img tags are skipped anyway)
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
return None
path = path.removesuffix('?amp=1')
return path
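# Illustrative behaviour (durl.pwa() is assumed to return the path including
# the query string):
#   '/media/photo.JPG'          -> None (image links are skipped)
#   '/2021/11/some-post/?amp=1' -> '/2021/11/some-post/'
#   '/texts/essay/'             -> '/texts/essay/' (unchanged)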

View file

@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.
This plugin implements :func:`site_filter`.
"""
import re
from atextcrawler.models import Site
MIN_RELEVANCE_SCORE = 5
async def site_filter(site: Site) -> bool:
"""
Assess relevance of the site (using language-dependent criteria).
If the site shall be crawled, return True, else False.
"""
# limit to sites in English or German language
if not set(['de', 'en']) & set(site.langs):
return False
score = 0.0
for crit_name, weight, langs, crit_re in re_criteria:
if '*' in langs or set(langs) & set(site.langs):
findings = crit_re.findall(site.startpage_text)
if findings:
score += weight * len(findings)
if site.title and crit_re.search(site.title):
score += 4 * weight
if site.description and crit_re.search(site.description):
score += 4 * weight
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
return score >= MIN_RELEVANCE_SCORE
re_criteria = {
(
'anarch',
1.0,
('*',),
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
),
('libertär', 0.5, ('de',), re.compile('(libert(är|&auml;r))', re.I)),
}
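# Worked example (illustrative): a site with langs=['de'] whose startpage text
# matches the 'anarch' pattern three times scores 3 * 1.0; if the title also
# matches, 4 * 1.0 is added, giving 7.0 >= MIN_RELEVANCE_SCORE, so the site is
# accepted for crawling.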

View file

@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
"""
Per-site path filter. Return whether the path shall be retrieved.
"""
if not robots.can_fetch_url(site.base_url + path):
return False
if 'amusewiki' in site.meta_info.get('generator', '').lower():
if any(
[
path.endswith(end)
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
]
):
return False
if '/bbselect?' in path:
return False
return True
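# Illustrative behaviour: paths disallowed by robots.txt are always rejected;
# on sites generated by amusewiki, paths ending in '.html', '.epub', '.tex',
# '.zip' or '.pdf' and paths containing '/bbselect?' are rejected as well;
# all other paths are fetched.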

View file

@@ -0,0 +1,10 @@
from .dedup import store_boilerplate_texts
from .feed import feed_types, update_feed
from .fetch import ResourceFetcher
from .operations import (
add_site_paths,
get_site_path,
process_site_path,
store_feed_entries,
)
from .sitemap import extract_sitemap_paths, get_sitemap_urls

View file

@@ -0,0 +1,96 @@
"""
Dev tool for fetching and displaying a resource.
Has no permanent effects.
"""
import asyncio
import logging
import sys
from collections import defaultdict
from pprint import pformat
import aiohttp
from ..models import Feed, TextResource
from ..resource import ResourceFetcher
from ..utils.annotation import pack_annotations, unpack_annotations
from ..utils.durl import Durl
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())
logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug')
logger_page_debug.setLevel(logging.DEBUG)
def add_tags(text, annotations):
"""
Reconstruct html from text and annotations.
This is very similar to what the client does when displaying
a cached hit.
"""
html = ''
opening_tags = defaultdict(list)
closing_tags = defaultdict(list)
anns_tags = sorted(
annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1])
)
for (i, f), anns in anns_tags:
opening_tags[i] += [tag for tag in reversed(anns)]
closing_tags[f] += [tag for tag in reversed(anns)]
positions = sorted(set(opening_tags.keys()) | set(closing_tags.keys()))
last_pos = 0
links = {i: href for href, (i, f, rel) in annotations['links'].items()}
for pos in positions:
html += text[last_pos:pos]
closing = closing_tags.get(pos, [])
opening = opening_tags.get(pos, [])
common = set(closing) & set(opening)
closing = [tag for tag in closing if tag not in common]
opening = [tag for tag in opening if tag not in common]
tags_html = ''
for tag in reversed(closing):
html += f'</{tag}>\n'
for tag in opening:
if tag == 'a':
href = links.get(pos, '#')
html += f'<a href="{href}">'
else:
html += f'<{tag}>'
last_pos = pos
return html
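# Sketch of the annotation format assumed by this function (inferred from the
# code above, for illustration only):
#
#     text = 'Hello world'
#     annotations = {
#         'tags': {(0, 11): ['p'], (6, 11): ['a']},
#         'links': {'https://example.org/': (6, 11, None)},
#     }
#     # add_tags(text, annotations) then yields roughly
#     # '<p>Hello <a href="https://example.org/">world</a></p>'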
async def run():
"""
Fetch and display a resource with URL given as cmdline argument.
"""
url = sys.argv[1]
async with aiohttp.ClientSession() as session:
if not (durl := await Durl(url)):
return
fetcher = ResourceFetcher(session)
resource = await fetcher.fetch(url)
if isinstance(resource, TextResource):
logger.warning(repr(resource))
logger.warning(f'Language: {resource.lang}')
logger.warning(pformat(resource.search_fields))
logger.warning(pformat(resource.init_fields))
# annotations = resource.search_fields.get('annotations')
# text = resource.search_fields['text']
# with open('/tmp/1.html', 'w') as f:
# html = add_tags(text, annotations)
# f.write(f'<html lang="de">\n<head><title>hhh</title></head>'
# f'<body>\n{html}\n</body></html>')
elif isinstance(resource, Feed):
logger.warning(resource.debug())
else:
logger.warning(f'Resource has type {type(resource)}')
logger.warning(resource)
if __name__ == '__main__':
asyncio.run(run())

View file

@@ -0,0 +1,59 @@
"""
Find boilerplate texts.
"""
from collections import Counter
from ..models import TextResource
from ..utils.probe import extract_samples
from ..utils.section import iter_sections
async def store_boilerplate_texts(fetcher, conn, site):
"""
Find and store boilerplate texts of a site.
Fetch the start page and internal sample links obtained from it.
If there are sufficiently frequently appearing text sections,
consider them as boilerplate texts.
If boilerplate_texts were found, update the given site instance.
"""
startpage = await fetcher.fetch(site.base_url, site=site)
if (
not isinstance(startpage, TextResource)
or startpage.content_type != 'html'
):
return
# fetch sample resources
sample_links = extract_samples(startpage.init_fields['links_int'])
resources = [startpage]
for sample_link in sample_links:
if sample_link.path == site.base_url: # avoid duplicate resources
continue # NB: duplicate resources may have different paths
sample_resource = await fetcher.fetch(sample_link.url(), site=None)
if (
isinstance(sample_resource, TextResource)
and sample_resource.content_type == 'html'
):
resources.append(sample_resource)
# find common texts in resources
if (n_resources := len(resources)) > 2:
text_freq = Counter()
for resource in resources:
text = resource.search_fields['text']
semantic_breaks = resource.search_fields['annotations'][
'semantic_breaks'
]
for sec in iter_sections(text, semantic_breaks):
text_freq[sec[3]] += 1
boilerplate_texts = []
if min(text_freq.values() or [0]) == 1: # no resource fetched twice
for text, freq in text_freq.items():
if freq > 2:
boilerplate_texts.append(text)
sql = "UPDATE site SET boilerplate_texts=$1 WHERE id=$2"
await conn.execute(sql, boilerplate_texts, site.id_)
site.boilerplate_texts = boilerplate_texts

View file

@@ -0,0 +1,131 @@
"""
Parse documents (often application/pdf).
"""
import logging
import re
from datetime import datetime
from typing import Optional, Union
from tika import parser
from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.durl import Durl
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from .plaintext import annotate_text
logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
re_url = re.compile(
r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)
async def parse_document(
durl: Durl,
resp: dict,
site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
"""
Extract plain text from documents in various formats.
"""
content = resp['content']
# HTTP headers, canonical URL, shortlink
header_links = await get_header_links(resp['headers'], durl, site)
if canonical := header_links.get('canonical'):
if canonical != durl.url():
return ResourceRedirect(resp['redirects'] + [canonical])
shortlink = header_links.get('shortlink')
# use tika to extract text
doc = parser.from_buffer(content)
# logger.debug(pformat(doc))
if doc.get('status') != 200:
msg = f'Analyzing document failed: {durl.url()}'
return ResourceError(msg)
# collect meta data
meta = doc.get('metadata', {})
content_type = meta.get('Content-Type')
if isinstance(content_type, list):
content_type = content_type[-1]
title = concat(meta.get('title'))
concat(meta.get('creator'))
last_change = extract_latest(meta.get('date') or meta.get('created'))
keywords = None
# text content
text = (doc.get('content') or '').strip()
# links
links_int: dict[Durl, tuple[list[str], str]] = {}
links_ext: dict[Durl, tuple[list[str], str]] = {}
for url in re_url.findall(text):
link_durl = await Durl(url[0])
if link_durl:
if link_durl.site() == durl.site():
links_int[link_durl] = [], link_durl.url()
else:
links_ext[link_durl] = [], link_durl.url()
# annotations
text, annotations = annotate_text(text)
return TextResource(
content_type=content_type,
last_change=last_change,
text_len=len(text),
lang=extract_content_language(text),
title=title,
init_fields={
'durl': durl,
'site': site,
'headers': resp['headers'],
'redirects': resp['redirects'],
'links_int': links_int,
'links_ext': links_ext,
'shortlink': shortlink,
'canonical': None,
},
search_fields={
'title': title,
'pub_date': last_change,
'keywords': keywords,
'text': text,
'annotations': annotations,
},
)
def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]:
"""
Extract the latest date (if any) from a string or list of strings.
"""
if not s:
return None
if not isinstance(s, list):
s = [s]
dt = []
for t in s:
try:
dt.append(datetime.fromisoformat(t.rstrip('Z')))
except:
pass
return max(dt) if dt else None
def concat(s: Optional[Union[str, list]]) -> Optional[str]:
"""
Helper function for joining strings together.
"""
if not s:
return None
if not isinstance(s, list):
s = [s]
return ' '.join(s)

View file

@@ -0,0 +1,155 @@
"""
Stuff related to feeds.
Higher-level stuff is in site.feeds.
"""
import logging
from datetime import datetime, timezone
from typing import Optional, Union
from asyncpg import Connection
from feedparser import parse
from ..models import Feed, MetaResource, ResourceError
from ..utils.durl import Durl
logger = logging.getLogger(__name__)
feed_types = (
'application/rss+xml',
'application/atom+xml',
'application/feed+json',
)
async def update_feed(fetcher, feed, conn) -> Optional[list[dict]]:
"""
Fetch, parse and return a given feed's content. Also update *feed*.
If the server replied with HTTP 410, delete the feed.
If there is no new information (server replied with HTTP 304),
return None. For other errors also return None and increase the
fail_count.
"""
headers = {'Cache-control': 'max-age=600'}
if feed.modified:
headers['If-Modified-Since'] = feed.modified
elif feed.etag:
headers['If-None-Match'] = feed.etag.removeprefix('W/')
resource = await fetcher.fetch(feed.url, headers=headers)
if isinstance(resource, ResourceError):
if resource.status == 410:
msg = f'Feed has vanished, deleting it: {feed}'
logger.debug(msg)
await feed.delete(conn)
if resource.status != 304:
feed.fail_count += 1
if feed.fail_count > 5:
msg = f'Feed not reachable, deleting it: {feed}'
logger.debug(msg)
await feed.delete(conn)
return None  # no new entries (HTTP 304) or retrieval error
elif isinstance(resource, Feed):
resource.id_ = feed.id_
resource.site_id = feed.site_id
await resource.save(conn)
return resource.entries
else:
return None
def parse_json_feed(resp, data: dict) -> Feed:
"""
Parse a JSON response for jsonfeed information.
TODO: handle 'next_url' (see https://jsonfeed.org/version/1.1)
"""
feed = Feed()
feed.url = data.get('feed_url', resp['redirects'][-1])
feed.etag = resp['headers'].get('ETag')
feed.modified = resp['headers'].get('Last-Modified')
feed.t_visit = datetime.utcnow()
version = data.get('version', '')
version = 'json-' + version.removeprefix('https://jsonfeed.org/version/')
feed.version = version[:10]
feed.title = data.get('title')
feed.description = data.get('description')
feed.fail_count = 0
entries = []
latest = None
# parse feed entries to a dict compatible with feedparser's entries
for feed_item in data.get('items', []):
entry = {}
entry['link'] = feed_item.get('url')
dt = feed_item.get('date_published')
if dt:
dt = datetime.fromisoformat(dt) if dt else None
dt = dt.astimezone(tz=None).replace(tzinfo=timezone.utc)
entry['published_parsed'] = dt.timetuple()
entry['title'] = feed_item.get('title')
entry['summary'] = feed_item.get('summary')
entries.append(entry)
if dt:
latest = max(latest or dt, dt)
feed.entries = entries
feed.t_content = latest
return feed
def parse_xml_feed(resp) -> Union[Feed, ResourceError]:
"""
Parse a response from ResourceFetcher.get_resp() for XML feed information.
"""
feed = Feed()
feed.url = resp['redirects'][-1]
feed.etag = resp['headers'].get('ETag')
feed.modified = resp['headers'].get('Last-Modified')
feed.t_visit = datetime.utcnow()
try:
parsed = parse(resp['content'], response_headers=resp['headers'])
except Exception as error:
return ResourceError(f'Feedparser error: {error}')
latest = parsed['feed'].get('updated_parsed')
if latest:
latest = datetime(*latest[:6])
feed.t_content = max(feed.t_content or latest, latest)
feed.version = parsed['version']
feed.title = parsed['feed'].get('title', '')[:200] or None
feed.description = parsed['feed'].get('description')
feed.fail_count = 0
feed.entries = parsed['entries']
return feed
def convert_feed_entries(
base_url: Optional[str],
entries: list[dict],
) -> tuple[
list[tuple[str, bool]],
dict[str, tuple[Optional[str], Optional[str], Optional[str]]],
]:
"""
Extract paths and resource meta information from a feed's entries.
Return paths in a structure wanted by :func:`add_site_paths` and
resource meta information in a structure wanted by
:func:`update_resource_meta`.
"""
paths = []
resource_meta = {}
for entry in entries:
if entry.get('link') and entry['link'].startswith(base_url or ''):
path = entry['link'].removeprefix(base_url or '').lstrip('/')
if len(path) <= 200:
last_update = entry.get('published_parsed')
if last_update:
last_update = datetime(*last_update[:6])
paths.append((path, True))
resource_meta[path] = (
last_update,
entry.get('title', '')[:200] or None,
entry.get('summary', '')[:2000] or None,
)
return paths, resource_meta
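# Illustrative input/output (values are made up):
#
#     entries = [{'link': 'https://example.org/blog/post-1',
#                 'published_parsed': (2021, 11, 28, 12, 0, 0, 6, 332, 0),
#                 'title': 'Post 1', 'summary': 'First post'}]
#     convert_feed_entries('https://example.org', entries)
#     # -> ([('blog/post-1', True)],
#     #     {'blog/post-1': (datetime(2021, 11, 28, 12, 0),
#     #                      'Post 1', 'First post')})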

View file

@@ -0,0 +1,327 @@
"""
Access to a resource specified by a URL.
"""
import gzip
import logging
from json import loads
from traceback import format_exc
from typing import Any, Optional, Union
import aiohttp
from bs4 import BeautifulSoup
from ..models import (
Feed,
MetaResource,
ResourceError,
ResourceRedirect,
Site,
TextResource,
)
from ..utils.durl import Durl
from ..utils.link import in_blacklist
from .document import parse_document
from .feed import parse_json_feed, parse_xml_feed
from .page import parse_html
from .plaintext import parse_plaintext
from .sitemap import parse_sitemap, parse_sitemapindex
logger = logging.getLogger(__name__)
MAX_REDIRECTS = 10
"""
Maximum number of redirects to follow.
"""
default_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)'
' Gecko/20100101 Firefox/78.0',
'DNT': '1',
'Upgrade-Insecure-Requests': '1',
'Accept-Language': 'en-US,en;q=0.5, *;q=0.5',
}
"""
Default HTTP client headers, overwriting those of aiohttp.ClientSession.
"""
blacklist_content_types = [
'',
'application/ogg',
]
"""
Blacklist for content-types.
"""
text_content_types = {
'text/html': 'html',
'text/plain': 'plain',
'application/rss+xml': 'feed-rss',
'application/atom+xml': 'feed-atom',
'application/feed+json': 'feed-json',
'application/json': 'json',
'application/xml': 'xml',
'text/xml': 'xml',
}
"""
Map content-types to parsers.
"""
class ResourceFetcher:
"""
Fetch a resource specified by a URL (:meth:`fetch`).
The timeout is the same for all requests.
"""
def __init__(
self,
session: aiohttp.ClientSession,
timeout_sock_connect: Union[int, float] = 8,
timeout_sock_read: Union[int, float] = 30,
):
self.session = session
self.timeout = aiohttp.ClientTimeout(
sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
)
async def fetch(
self,
url: str,
site: Optional[Site] = None,
redirect_history: Optional[list[str]] = None,
headers: Optional[dict] = None,
) -> Union[
None, MetaResource, TextResource, ResourceError, ResourceRedirect
]:
"""
Try to fetch a resource and return an instance or error or redirect.
If an error was encountered, return a ResourceError.
If the resource has an irrelevant content type, return None.
Otherwise return a specific content instance.
Argument *redirect_history* contains the redirect history;
if one of the redirects is encountered again, return None.
"""
if redirect_history is None:
redirect_history = []
if not (durl := await Durl(url)):
return ResourceError('Invalid URL')
resp = await self.get_resp(
durl,
redirect_history=redirect_history,
headers=headers,
)
if isinstance(resp, ResourceError):
return resp
if resp is None:
return None
result = await self._parse(durl, site, resp)
if isinstance(result, (MetaResource, TextResource)):
result.id_ = None
return result
async def _parse(
self, durl, site, resp, in_recursion=False
) -> Union[
None, MetaResource, TextResource, ResourceError, ResourceRedirect
]:
"""
Parse a response. May call itself.
"""
result: Union[
None, MetaResource, TextResource, ResourceError, ResourceRedirect
] = None
content = resp['content']
if isinstance(content, str) and content.startswith('<?xml '):
result = await parse_xml(durl, resp)
elif resp['parser'] == 'feed-rss':
result = await parse_xml(durl, resp, rss=True)
elif resp['parser'] == 'feed-atom':
result = await parse_xml(durl, resp, atom=True)
elif resp['parser'] == 'xml':
result = await parse_xml(durl, resp)
elif resp['parser'] == 'html':
result = await parse_html(durl, resp, site)
elif resp['parser'] in ('json', 'feed-json'):
result = await parse_json(durl, resp)
elif resp['parser'] == 'plain':
result = await parse_plaintext(durl, resp, site)
elif resp['parser'] == 'application':
if resp['headers'].get('content-type') == 'application/x-gzip':
if in_recursion:
return None # consider nested gzip an attack
resp['content'] = gzip.decompress(resp['content'])
return await self._parse(durl, site, resp, in_recursion=True)
result = await parse_document(durl, resp, site)
if isinstance(result, ResourceRedirect):
redir_url = result.urls[-1]
result = await self.fetch(
redir_url,
site=site,
redirect_history=result.urls[:-1],
)
return result
async def get_resp(
self,
durl: Durl,
headers: dict = None,
redirect_history: Optional[list[str]] = None,
) -> Optional[Union[ResourceError, dict]]:
"""
Try to fetch a url returning a ResourceError or a dict with content.
Optional *headers* will overwrite the :var:`default_headers`.
If the response status is not 200, always return a ResourceError.
If the content-type is not relevant (see blacklist_content_types),
return None.
The dict contains these keys+values:
* 'parser': a hint on the parser to use for analyzing the content;
one of 'html', 'plain', 'feed-rss', 'feed-atom', 'feed-json', 'json',
'xml', 'application'
* 'content': bytes for type application, otherwise str
* 'redirects': a list of URLs visited during HTTP redirection,
the last item is the final URL
* 'headers': response headers
"""
if redirect_history is None:
redirect_history = []
if len(redirect_history) >= MAX_REDIRECTS:
return None
headers_ = default_headers.copy()
if headers:
headers_.update(headers)
try:
async with self.session.get(
durl.url(),
headers=headers_,
timeout=self.timeout,
) as resp:
redirects = [durl.url()]
if resp.history:
href = resp.history[-1].headers.get('location')
if not href or not (redurl := await Durl(href, base=durl)):
msg = 'Invalid URL after HTTP redirect'
return ResourceError(msg)
if in_blacklist(redurl.hostname):
src_url = (
redirect_history[0]
if redirect_history
else durl.url()
)
msg = (
f'Dropping URL {src_url}, since'
f' redirected to a blacklisted site'
)
logger.debug(msg)
return None
redirects = [str(r.url) for r in resp.history]
redirects.append(redurl.url())
if join := set(redirect_history) & set(redirects):
msg = f'Cyclic redirect {join}'
return ResourceError(msg)
if resp.status != 200:
msg = f'HTTP status {resp.status}'
return ResourceError(
msg, status=resp.status, headers=headers
)
c_type = resp.headers.get('content-type', '').split(';')[0]
if c_type in blacklist_content_types:
return None
result: dict[str, Any] = {
'redirects': redirect_history + redirects,
'headers': resp.headers,
}
if c_type in text_content_types.keys():
try: # catch decoding issues
content = await resp.text()
except:
body = await resp.read()
encoding = resp.charset or 'utf-8'
encoding = encoding.replace('CP-1250', 'cp1250')
content = body.decode(encoding, errors='replace')
result['content'] = content
result['parser'] = text_content_types[c_type]
return result
elif c_type.startswith('application/'):
result['content'] = await resp.read()
result['parser'] = 'application'
return result
except aiohttp.ClientError as error:
# on certificate error try without tls
if 'SSLCertVerificationError' in str(error):
if durl.scheme == 'https':
url = durl.url()
durl.replace_scheme('http')
response = await self.get_resp(
durl=durl,
headers=headers,
redirect_history=redirect_history + [url],
)
if not isinstance(response, ResourceError):
return response
msg = f'ClientError: {error}'
return ResourceError(msg)
except Exception as error:
msg = f'Unknown error: {error}:\n{format_exc()}'
logger.error(msg)
return ResourceError(msg)
return None
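# Usage sketch (illustrative; see also the dev tool in this package that
# fetches and displays a single resource):
#
#     async with aiohttp.ClientSession() as session:
#         fetcher = ResourceFetcher(session)
#         result = await fetcher.fetch('https://example.org/')
#         # result is a TextResource, MetaResource, ResourceError,
#         # ResourceRedirect or None, depending on the response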
async def parse_xml(
durl: Durl,
response: dict,
rss=False,
atom=False,
) -> Optional[Union[MetaResource, ResourceError]]:
"""
Parse XML content.
In particular, parse sitemapindex, sitemap, RSS feed, atom feed.
"""
try:
xml = response['content']
soup = BeautifulSoup(xml, 'html.parser')
except:
return None
if rss or (rss := soup.find('rss')):
return parse_xml_feed(response)
elif atom or (atom := soup.find('atom')):
return parse_xml_feed(response)
elif sitemapindex := soup.find('sitemapindex'):
return parse_sitemapindex(sitemapindex)
elif urlset := soup.find('urlset'):
return parse_sitemap(urlset)
else:
return None
async def parse_json(
durl: Durl,
response: dict,
) -> Optional[Union[Feed, ResourceError]]:
"""
Parse the content of JSON feeds.
"""
try:
data = loads(response['content'])
except:
msg = f'Could not parse JSON from {durl.url()}'
logger.debug(msg)
return None
if not isinstance(data, dict):
return None
if data.get('version', '').startswith('https://jsonfeed.org/'):
return parse_json_feed(response, data)
return None

View file

@@ -0,0 +1,347 @@
"""
Operations on resources.
"""
import logging
from datetime import datetime
from typing import Optional, Sequence
from asyncpg import Connection
from ..models import (
Feed,
MetaResource,
ResourceError,
Site,
Sitemap,
SitemapIndex,
SitePath,
TextResource,
)
from ..search import delete_resource, index_resource
from ..tensorflow import TensorFlow
from ..utils.durl import Durl
from ..utils.similarity import (
create_simhash,
search_simhash,
simhash_from_bigint,
simhash_to_bigint,
)
from .feed import convert_feed_entries
from .fetch import ResourceFetcher
from .sitemap import extract_sitemap_paths
logger = logging.getLogger(__name__)
async def add_site_paths(
conn: Connection,
site_id: int,
paths: Sequence[tuple[str, Optional[bool]]],
) -> None:
"""
Add site paths, skipping paths that already exist or are longer than 400 chars.
The paths must be given as relative paths, each together with a boolean
telling whether the link is a canonical link.
"""
sql = (
"INSERT INTO site_path (site_id, path, canonical)"
" VALUES ($1, $2, $3) ON CONFLICT (site_id, path) DO NOTHING"
)
values = (
(site_id, path, canonical)
for path, canonical in paths[:100000]
if len(path) <= 400
)
await conn.executemany(sql, values)
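# Example call (illustrative): store two relative paths for a site, marking
# the first as canonical and leaving the second undecided:
#
#     await add_site_paths(conn, site_id, [('blog/post-1', True),
#                                          ('about', None)])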
async def update_resource_meta(
conn: Connection,
site_id: int,
resource_meta: dict,
) -> None:
"""
Update meta information of existing resources using path to find them.
"""
sql = (
"UPDATE resource SET last_change=coalesce($1, last_change),"
" title=coalesce($2, title), summary=coalesce($3, summary) FROM ("
" SELECT resource_id FROM site_path WHERE site_id=$4 AND path=$5"
") sp WHERE resource.id=sp.resource_id"
)
values = ((*meta, site_id, path) for path, meta in resource_meta.items())
await conn.executemany(sql, values)
async def store_feed_entries(
conn: Connection,
site: Site,
entries: list[dict],
) -> None:
"""
Add missing resources of a site from given feed entries.
"""
if site.id_:
paths, resource_meta = convert_feed_entries(site.base_url, entries)
await add_site_paths(conn, site.id_, paths)
await update_resource_meta(conn, site.id_, resource_meta)
async def get_site_path(
conn: Connection,
site: Site,
before: datetime,
only_new=False,
) -> Optional[SitePath]:
"""
Return the next path of a given site that needs to be processed.
If none needs to be processed, return None.
Only return paths that have last been visited before *before*
or not been processed at all. Paths with an ok_count of -3 or lower
are skipped.
If *only_new*, limit to paths that have not been processed at all,
irrespective of the value of *before*.
"""
if only_new:
sql = (
"SELECT * FROM site_path"
" WHERE site_id=$1 AND last_visit is null LIMIT 1"
) # implicitly canonical=null
row = await conn.fetchrow(sql, site.id_)
else:
sql = (
"SELECT * FROM site_path"
" WHERE site_id=$1 AND canonical IS NOT false AND"
" (last_visit is null OR last_visit<$2) AND"
" ok_count > -3 LIMIT 1"
) # canonical can be true or null
row = await conn.fetchrow(sql, site.id_, before)
if row:
return await SitePath().load_from_row(row)
return None
async def process_site_path(
app,
worker_number: int,
conn: Connection,
fetcher: ResourceFetcher,
tf: TensorFlow,
site: Site,
site_path: SitePath,
) -> bool:
"""
Fetch a path, deduplicate and if canonical, update and index the resource.
Return whether a new resource was handled that should contribute to the
statistics.
"""
msg = (
f'Worker {worker_number} processing site {site.id_}'
f' site_path {site_path.id_} {site.base_url}{site_path.path}'
)
logger.debug(msg)
if not site.id_: # only to satisfy typing
return False
# fetch url
site_path.last_visit = datetime.utcnow()
url = site_path.url(site)
resource = await fetcher.fetch(url, site=site)
# handle failure (possibly deleting old information)
if not isinstance(resource, (TextResource, MetaResource)):
if not resource: # irrelevant content-type
site_path.ok_count = -10
elif isinstance(resource, ResourceError):
site_path.ok_count -= 1
if site_path.ok_count <= -3 and site_path.resource_id:
await site_path.unlink_resource(
conn,
app.search_engine,
app.config['elasticsearch']['index_base_name'],
)
await site_path.save(conn)
if resource: # relevant content-type
msg = (
f'Worker {worker_number} failed to process site_path'
f' {site_path.id_} (site {site.id_},'
f' {site.base_url}{site_path.path})'
)
logger.info(msg)
return False
# handle MetaResources
if isinstance(resource, MetaResource):
if isinstance(resource, Feed):
resource.site_id = site.id_
await resource.save(conn)
if resource.entries:
await store_feed_entries(conn, site, resource.entries)
elif isinstance(resource, Sitemap):
paths, _ = extract_sitemap_paths(site.base_url, resource.urls)
await add_site_paths(conn, site.id_, paths)
elif isinstance(resource, SitemapIndex):
for sitemap_dict in resource.sitemaps:
url = sitemap_dict['loc']
res_sitemap = await fetcher.fetch(url, site=site)
if isinstance(res_sitemap, Sitemap):
paths, _ = extract_sitemap_paths(
site.base_url, res_sitemap.urls
)
await add_site_paths(conn, site.id_, paths)
return False
# handle TextResource
relevant, is_new_resource = await _handle_text_resource(
app, conn, tf, site, site_path, resource, url
)
if not relevant:
return False
site_path.resource_id = resource.id_
site_path.canonical = resource.init_fields.get('canonical')
site_path.ok_count += 1
await site_path.save(conn)
if shortlink_url := resource.init_fields.get('shortlink'):
await _save_shortlink(
conn, site, url, resource, shortlink_url, site_path.last_visit
)
return is_new_resource
async def _handle_text_resource(
app, conn, tf, site, site_path, resource, url
) -> tuple[bool, bool]:
"""
Ingest a text resource.
Return whether the resource is relevant and whether it is new.
"""
# save the resource's internal links
paths = []
if links_int := resource.init_fields['links_int']:
for durl, (rel, _) in links_int.items():
rp_filter = app.plugins['filter_resource_path'].rp_filter
if path := rp_filter(site, durl):
canon = (rel and rel.lower() == 'canonical') or None
paths.append((path, canon))
await add_site_paths(conn, site.id_, paths)
# find resources similar to the current text
text = resource.search_fields['text']
if len(text) < 300: # discard resources with too short texts
site_path.resource_id = None
await site_path.save(conn)
return False, False
simhash = simhash_from_bigint(resource.simhash)
index = site.simhash_index
similar_ids = search_simhash(index, simhash)
# determine the destination resource and resources to be merged into it
old_id = site_path.resource_id
if (
old_id
and old_id in similar_ids
and ( # similar to old text
dest_resource := await TextResource().load(conn, old_id)
)
):
merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
else: # no old text, or old text not similar any more
if old_id:
await site_path.unlink_resource(
conn,
app.search_engine,
app.config['elasticsearch']['index_base_name'],
)
# find the first existing similar resource
for similar_id in similar_ids:
dest_resource = await TextResource().load(conn, similar_id)
if dest_resource:
# also require similar length
l1 = len(resource.search_fields['text'])
l2 = dest_resource.text_len
if 0.95 * l2 <= l1 <= 1.05 * l2:
merge_ids = list(
filter(lambda elem: elem != similar_id, similar_ids)
)
break
else:
dest_resource = None
merge_ids = []
# update or create the destination resource
if dest_resource:
is_new_resource = False
resource.simhash = create_simhash(index, dest_resource.id_, simhash)
await dest_resource.update_from_resource(resource)
resource = dest_resource
else:
is_new_resource = True
resource.simhash = simhash_to_bigint(simhash)
await resource.save(conn)
create_simhash(index, resource.id_, simhash)
# add resource to search index
if resource.content_type in ('html', 'plain'):
await index_resource(
app.search_engine,
tf,
site_path,
resource,
site.base_url,
url,
)
# merge resources: merge_ids -> resource
for merge_id in merge_ids:
# replace links to the merge resource with links to the dest resource
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
await conn.execute(sql, resource.id_ or None, merge_id)
# remove orphaned merge resource
sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
found = await conn.fetchval(sql, merge_id)
if found:
await delete_resource(
app.search_engine,
found[1],
merge_id,
)
return True, is_new_resource
async def _save_shortlink(
conn, site, url, resource, shortlink_url, last_visit
):
"""
Save a shortlink.
"""
shortlink_durl = await Durl(shortlink_url, base=site.base_url)
if shortlink_durl and shortlink_url != url:
sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2"
sl_path = shortlink_durl.pwa()
row = await conn.fetchrow(sql, site.id_, sl_path)
shortlink = await SitePath().load_from_row(row)
if not shortlink:
shortlink = SitePath(
site_id=site.id_,
path=sl_path,
last_visit=last_visit,
ok_count=1,
canonical=False,
resource_id=resource.id_,
)
else:
shortlink.last_visit = last_visit
shortlink.ok_count += 1
shortlink.canonical = False
shortlink.resource_id = resource.id_
await shortlink.save(conn)

View file

@ -0,0 +1,355 @@
"""
Parse HTML pages.
"""
import logging
from copy import deepcopy
from typing import Optional, Union
from bs4 import BeautifulSoup
from tidylib import tidy_document
from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import (
annotate,
annotations_remove_section,
clean_annotations,
get_tag_counts,
headline_probability,
)
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl, assort_links
from ..utils.html import (
clean_body,
clean_page,
extract_title,
get_html_lang,
get_html_redirect,
)
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.section import iter_sections
from ..utils.tag import keep_tags
logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
logger_links = logging.getLogger(__name__ + '.debug.links')
logger_stats = logging.getLogger(__name__ + '.debug.stats')
logger_sections = logging.getLogger(__name__ + '.debug.sections')
async def parse_html(
durl: Durl,
resp: dict,
site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
"""
Extract relevant data from a response returning a TextResource instance.
The given URL must be the full URL (incl. scheme and netloc) of the page.
"""
html = resp['content']
# follow link to canonical URL
header_links = await get_header_links(resp['headers'], durl, site)
if canonical := header_links.get('canonical'):
if canonical != durl.url():
return ResourceRedirect(resp['redirects'] + [canonical])
# follow html redirect, if present
if redir_url := get_html_redirect(html):
if redir_url not in resp['redirects']:
return ResourceRedirect(resp['redirects'] + [redir_url])
else:
msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}'
return ResourceError(msg)
# require html tag
if not html[:14].lower().startswith('<!doctype html'):
if '<html' not in html:
return None
# real URL after redirection
url = resp['redirects'][-1]
durl = await Durl(url)
if not durl:
return None
# page title
title = extract_title(html)
# tidy html
try:
html, _ = tidy_document(
html.encode('utf-8'),
options={
'logical-emphasis': 1,
'merge-divs': 1,
'merge-spans': 1,
'hide-comments': 1,
'output-bom': 0,
'show-errors': 0,
},
)
html = html.decode('utf-8')
except:
msg = f'Cannot tidy html from {url}'
return ResourceError(msg)
# drop irrelevant tags, including their contents
soup = clean_page(html)
# extract shortlink (from http headers or html head)
shortlink = header_links.get('shortlink')
if not shortlink and soup.head:
for link in soup.head.find_all('link'):
if 'shortlink' in link.get('rel', ''):
if link.get('href'):
shortlink = link.get('href')
break
# language, plaintext, annotations, last change
lang = get_html_lang(html)
html = clean_body(str(soup.body))
head = soup.head
text, annotations = annotate(html)
if lng := extract_content_language(text):
lang = lng
last_change = extract_latest_date(html, lang=lang)
# assort internal and external links
base_url = None
if head and head.base:
base_url = head.base.get('href')
if not base_url and site:
base_url = site.base_url
cleaned_links, links_int, links_ext = await assort_links(
annotations['links'], durl, text, base_url
)
annotations['links'] = cleaned_links
if logger_links.isEnabledFor(logging.DEBUG):
logger_links.debug('==== internal links')
for durl_, txt in links_int.items():
logger_links.debug(f'{durl_.url()} {txt}')
logger_links.debug('==== external links')
for durl_, txt in links_ext.items():
logger_links.debug(f'{durl_.url()} {txt}')
# keywords from category links
category_links = set()
for href, (i, f, rel) in annotations['links'].items():
if rel and ('category' in rel or 'tag' in rel):
category_links.add(text[i:f])
keywords = sorted(category_links)
# filter out irrelevant sections
filtered_text, filtered_ann = filter_sections(
text, annotations, site.boilerplate_texts if site else None
)
# debug statistics
if logger_stats.isEnabledFor(logging.DEBUG):
sb = annotations['semantic_breaks']
fsb = filtered_ann['semantic_breaks']
logger_stats.debug(
f'Page statistics:'
f' html_len={len(html)} text_len={len(filtered_text)}'
f' ratio={len(filtered_text) / len(html):.2f};'
f' sections={len(sb)} filtered_sections={len(fsb)}'
f' ratio={len(fsb) / len(sb):.2f} url={durl.url()}'
)
return TextResource(
content_type='html',
last_change=last_change,
text_len=len(text),
lang=lang,
title=title,
init_fields={
'durl': durl,
'site': site,
'headers': resp['headers'],
'redirects': resp['redirects'],
'links_int': links_int,
'links_ext': links_ext,
'shortlink': shortlink,
'canonical': True if canonical else None,
'head': head,
},
search_fields={
'title': title,
'pub_date': last_change,
'keywords': keywords,
'text': filtered_text,
'annotations': filtered_ann,
'head': str(head),
},
)
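# Minimal sketch (an assumption, not from the original module) of the *resp*
# dict that parse_html consumes; the keys mirror those accessed above
# ('content', 'headers', 'redirects'), with the last redirect entry being
# the final URL, and *durl* is an awaited Durl of that URL.
#
# resp = {
#     'content': '<!doctype html><html>...</html>',
#     'headers': {'Content-Type': 'text/html; charset=utf-8'},
#     'redirects': ['https://example.org/page'],
# }
# resource = await parse_html(durl, resp, site)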
def filter_sections(text, annotations, boilerplate_texts):
"""
Filter out irrelevant sections using scores and factoring in neighbors.
"""
tags = annotations['tags']
sb = annotations['semantic_breaks']
section_ids = annotations['section_ids']
# for i1,f1 in sorted(tags.keys()):
# print(' ', i1,f1,tags[(i1,f1)], text[i1:f1])
# for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
# print('-' * lvl, i,f,','.join(tags[(i+1, f)]), sb[i], txt)
# print('_' * 50)
# from pprint import pprint
# pprint(sb)
# pprint(tags)
# pprint(section_ids)
# calculate keep scores for sections
# negative scores mean: drop; positive scores mean keep;
# scores between -2 and 2 are undecided
sections_keep = {}
headline_probs = {}
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
if prob := headline_probability(txt, tags[(i, f)], lvl):
headline_probs[(i, f)] = prob
w = 0
n_chars = f - i - 1
# string length
w = (n_chars - 80) / 80 # initial weight
# punctuation (counted within this section's text)
w += 0.4 * txt.count('.') + 0.1 * txt.count(',')
# p tag
if 'p' in tags[(i + 1, f)]: # prefer keeping paragraphs
w += 0.7
# links
n_links, link_density, avg_text_len = get_tag_counts(
('a',), i, f, tags, text
)
if link_density > 0.5:
w = -n_links
elif link_density > 0.3 and avg_text_len < 60:
w = -3
else:
n_li, li_density, li_len = get_tag_counts(
('li',), i, f, tags, text
)
if link_density > 0.2 and li_density > 0.8 and li_len < 50:
w = -3
if 52 <= lvl < 60:
w = max(w, 1.0)
if 'sidebar' in ' '.join(section_ids.get(i, [])):
w = -3
if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt):
w = -3
# special chars
if txt.startswith('«') or txt.endswith('»'):  # wordpress navigation
w = -3
# remove boilerplate texts
if boilerplate_texts and txt in boilerplate_texts:
w = -10
sections_keep[(i, f)] = w, lvl
# amend keep scores: look at preceding / subsequent sections with
# equal level and transfer their keep scores to the current section
n = len(sections_keep)
sections = list(sorted(sections_keep.keys()))
# inspect subsequent sections:
for rev_ind, s_range in enumerate(reversed(sections)):
ind = n - 1 - rev_ind
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
w_sum = 0
n_peers = 0
for i in range(ind + 1, min(n, ind + 15)):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ != lvl:
break
n_peers += 1
w_sum += w_
if n_peers >= 3:
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
# inspect preceding sections:
for ind, s_range in enumerate(sections):
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
w_sum = 0
n_peers = 0
for i in range(ind - 1, max(0, ind - 15), -1):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ != lvl:
break
n_peers += 1
w_sum += w_
if n_peers >= 3:
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
# amend keep scores: look at sections that could be headlines
# for subsequent kept sections and increase their score;
# also allow for up to 2 sections in between (which will also
# have their score increased)
for rev_ind, s_range in enumerate(reversed(sections)):
ind = n - 1 - rev_ind
w, lvl = sections_keep[s_range]
if abs(w) <= 2:
if headline_probs.get(s_range, 0) > 0.49:
# look at subsequent sections with higher level
child_weights = []
for i in range(ind + 1, n):
w_, lvl_ = sections_keep[sections[i]]
if lvl_ <= lvl or w_ < -2:
break
child_weights.append(w_)
if nc := len(child_weights):
child_avg = sum(child_weights) / nc
if w + 1.2 * child_avg > 2:
sections_keep[s_range] = w + 1.2 * child_avg, lvl
if nc > 1:
if (w1 := child_weights[0]) <= 2:
sections_keep[sections[ind + 1]] = (
w1 + 1.5 * child_avg,
lvl,
)
if nc > 2:
if (w2 := child_weights[1]) <= 2:
sections_keep[sections[ind + 2]] = (
w2 + 2 * child_avg,
lvl,
)
# clean annotations
clean_annotations(annotations)
# debug sections
if logger_sections.isEnabledFor(logging.DEBUG):
logger_sections.debug('============= Weighted sections =============')
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
w, lvl = sections_keep[(i, f)]
indent = ('+' if w > 2 else '-') * lvl
ts = ','.join(tags[(i + 1, f)])
logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}')
# narrow down annotations and text to keep_sections
# drop undecided sections
filtered_text = text
filtered_ann = deepcopy(annotations)
for i, f in sorted(sections_keep.keys(), reverse=True):
w, lvl = sections_keep[(i, f)]
if w <= 2.0:
filtered_ann = annotations_remove_section(filtered_ann, i, f)
filtered_text = filtered_text[:i] + filtered_text[f:]
clean_annotations(filtered_ann)
# debug filtered sections
if logger_sections.isEnabledFor(logging.DEBUG):
logger_sections.debug('')
logger_sections.debug('============= Filtered sections =============')
fsb = filtered_ann['semantic_breaks']
ftags = filtered_ann['tags']
for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100):
indent = ' ' * lvl
ts = ','.join(ftags.get((i + 1, f), []))
logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}')
return filtered_text, filtered_ann

View file

@ -0,0 +1,148 @@
"""
Parse plaintext pages.
"""
import logging
import re
from typing import Any, Optional, Union
import pypandoc
from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import annotate
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.muse import parse_muse
logger = logging.getLogger(__name__)
MAX_LINK_TEXT_LENGTH = 100
"""
Maximum length of a link's text to be kept.
Cf. table site_link, column link_text.
"""
re_url = re.compile(
r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)
re_nl = re.compile(r'\r\n')
re_ws = re.compile(r'\s*\n\s*\n\s*')
re_nn = re.compile(r'\n\n')
async def parse_plaintext(
durl: Durl,
resp: dict,
site: Optional[Site],
) -> Optional[Union[ResourceRedirect, TextResource]]:
"""
Extract relevant data from a response returning a TextResource instance.
The given URL must be the full URL (incl. scheme and netloc) of the page.
"""
text = resp['content']
# HTTP headers, canonical URL, shortlink
header_links = await get_header_links(resp['headers'], durl, site)
if canonical := header_links.get('canonical'):
if canonical != durl.url():
return ResourceRedirect(resp['redirects'] + [canonical])
shortlink = header_links.get('shortlink')
if not text:
return None
text = re_nl.sub('\n', text)
text = re_ws.sub('\n\n', text)
# meta info
meta: dict[str, Any] = {}
muse = None
if durl.path.endswith('.muse'):
muse = parse_muse(text)
if muse:
meta, text = muse
# title
if not meta.get('title'):
meta['title'] = text[:200].splitlines()[0]
# content language
if not meta.get('lang'):
meta['lang'] = extract_content_language(text)
# publication date
if not meta.get('pub_date'):
meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang'))
# links
links_int: dict[Durl, tuple[list[str], str]] = {}
links_ext: dict[Durl, tuple[list[str], str]] = {}
for url in re_url.findall(text):
link_durl = await Durl(url[0])
if link_durl:
if link_durl.site() == durl.site():
links_int[link_durl] = [], link_durl.url()
else:
links_ext[link_durl] = [], link_durl.url()
if muse:
html = pypandoc.convert_text(text, 'html5', format='muse').strip()
text, annotations = annotate(html)
else:
text, annotations = annotate_text(text)
return TextResource(
content_type=resp['parser'],
last_change=meta.get('pub_date'),
text_len=len(text),
lang=meta.get('lang'),
title=meta.get('title'),
init_fields={
'durl': durl,
'site': site,
'headers': resp['headers'],
'redirects': resp['redirects'],
'links_int': links_int,
'links_ext': links_ext,
'shortlink': shortlink,
'canonical': None,
},
search_fields={
'title': meta.get('title'),
'authors': meta.get('authors'),
'pub_date': meta.get('pub_date'),
'keywords': meta.get('keywords'),
'summary': meta.get('summary'),
'text': text,
'annotations': annotations,
},
)
def annotate_text(text):
"""
Return annotations as :func:`utils.annotation.annotate` does.
Here we only have information on semantic breaks
(in plain text they occur where empty lines are).
"""
semantic_breaks = {}
for match in re_nn.finditer(text):
semantic_breaks[match.span()[0]] = ''
annotations = {
'tags': {},
'semantic_breaks': semantic_breaks,
'section_ids': {},
'links': {},
}
return text, annotations
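# Illustrative sketch of annotate_text on a two-paragraph text; the offset
# is the position of the empty line separating the sections, and the other
# annotation keys stay empty, as in the code above.
#
# >>> txt, ann = annotate_text('First paragraph.\n\nSecond paragraph.\n')
# >>> ann['semantic_breaks']
# {16: ''}
# >>> ann['tags'], ann['links']
# ({}, {})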

View file

@ -0,0 +1,149 @@
"""
Sitemap and SitemapIndex and related operations.
"""
import logging
from datetime import datetime
from typing import Optional
import pytz
from ..models import Sitemap, SitemapIndex, TextResource
logger = logging.getLogger(__name__)
async def get_sitemap_urls(
fetcher,
base_url: Optional[str],
sitemaps=None,
) -> list[dict]:
"""
Try to find sitemaps, fetch them and return the URLs they contain.
Each returned URL is a dict with key 'loc' and optional key 'lastmod'.
"""
if sitemaps:
# test example: https://www.berlin.de/
check_all = True
elif base_url:
sitemaps = [
base_url.rstrip('/') + '/sitemap.xml',
base_url.rstrip('/') + '/wp-sitemap.xml',
base_url.rstrip('/') + '/sitemap_index.xml',
base_url.rstrip('/') + '/sitemap.xml.gz',
base_url.rstrip('/') + '/sitemap_index.xml.gz',
base_url.rstrip('/') + '/sitemap.txt',
base_url.rstrip('/') + '/sitemap/',
base_url.rstrip('/') + '/sitemap1.xml',
base_url.rstrip('/') + '/sitemap-index.xml',
base_url.rstrip('/') + '/sitemapindex.xml',
base_url.rstrip('/') + '/sitemap/index.xml',
]
check_all = False
else:
return []
urls = []
for sitemap in sitemaps:
resource = await fetcher.fetch(sitemap)
found = True
if isinstance(resource, SitemapIndex):
for sitemap_ in resource.sitemaps:
sitemaps.append(sitemap_['loc'])
elif isinstance(resource, Sitemap):
urls += resource.urls
elif isinstance(resource, TextResource) and resource.content_type in (
'html',
'plain',
):
urls += [
{'loc': durl.url()}
for durl in resource.init_fields['links_int']
]
else:
found = False
if found and not check_all:
break
return urls
def parse_sitemapindex(sitemapindex):
"""
Parse a sitemap index returning a `SitemapIndex` with found sitemaps.
"""
sitemaps = []
for tag in sitemapindex.find_all('sitemap'):
if loc := tag.find('loc'):
if loc.string:
sitemap = {'loc': loc.string.strip()}
if lastmod := tag.find('lastmod'):
try:
t = datetime.fromisoformat(lastmod.string.strip())
sitemap['lastmod'] = t
except:
pass
sitemaps.append(sitemap)
return SitemapIndex(sitemaps=sitemaps)
def parse_sitemap(urlset) -> Sitemap:
"""
Parse a sitemap, returning a `Sitemap` whose URLs are dicts
with these keys+values:
* loc: the full URL of a mapped resource
* lastmod: optional datetime of its last modification
* changefreq: optional info on the change frequency to be expected
* priority: optional info on its priority relative to other resources
Cf. https://www.sitemaps.org/protocol.html
"""
urls = []
for tag in urlset.find_all('url'):
if loc := tag.find('loc'):
if loc.string:
url = {'loc': loc.string.strip()}
if lastmod := tag.find('lastmod'):
try:
t = lastmod.string.strip().rstrip('Z')
url['lastmod'] = (
datetime.fromisoformat(t)
.astimezone(pytz.utc)
.replace(tzinfo=None)
)
except:
pass
if changefreq := tag.find('changefreq'):
url['changefreq'] = changefreq.string.strip()
if priority := tag.find('priority'):
url['priority'] = priority.string.strip()
urls.append(url)
return Sitemap(urls=urls)
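# Illustrative sketch (not part of the module): feeding a minimal <urlset>
# into parse_sitemap. The XML string and the 'html.parser' choice are
# assumptions for the example only; the crawler gets the parsed tree from
# its fetcher.
#
# from bs4 import BeautifulSoup
# xml = (
#     '<urlset><url><loc>https://example.org/a</loc>'
#     '<lastmod>2021-11-01T00:00:00+00:00</lastmod>'
#     '<changefreq>weekly</changefreq></url></urlset>'
# )
# sitemap = parse_sitemap(BeautifulSoup(xml, 'html.parser'))
# # sitemap.urls == [{'loc': 'https://example.org/a',
# #                   'lastmod': datetime(2021, 11, 1, 0, 0),
# #                   'changefreq': 'weekly'}]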
def extract_sitemap_paths(
base_url: Optional[str],
urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
"""
Extract essential information from sitemap URLs.
Return a list of relative paths of the site's resources
(in a form to be easily fed into `add_site_paths`) and
the datetime of the latest change.
Relative paths are computed using base_url.
"""
paths = []
latest = None
for url in urls:
loc = url['loc']
lastmod = url.get('lastmod')
if loc.startswith(base_url or ''):
path = loc.removeprefix(base_url or '').lstrip('/')
path = path.split('#', 1)[0]
paths.append((path, True))
if lastmod:
latest = max(lastmod, latest or lastmod)
return paths, latest
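# Illustrative sketch of extract_sitemap_paths (URLs are made up):
# the base_url prefix and any '#fragment' are stripped from each loc.
#
# urls = [{'loc': 'https://example.org/blog/post-1#intro',
#          'lastmod': datetime(2021, 11, 1)}]
# extract_sitemap_paths('https://example.org', urls)
# # -> ([('blog/post-1', True)], datetime(2021, 11, 1, 0, 0))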

View file

@ -0,0 +1,6 @@
from .engine import (
delete_resource,
index_resource,
shutdown_engine,
startup_engine,
)

View file

@ -0,0 +1,270 @@
"""
Search engine, for now elasticsearch.
We have one index per supported language and a default one.
"""
import logging
import warnings
from difflib import SequenceMatcher
from typing import Union
from elasticsearch import AsyncElasticsearch
from elasticsearch.exceptions import NotFoundError
from ..utils.annotation import pack_annotations
from ..utils.section import concat_section_texts
logger = logging.getLogger(__name__)
warnings.filterwarnings(
'ignore',
'The client is unable to verify that the'
' server is Elasticsearch due security privileges on the server side',
)
MIN_INDEXING_TIMEOUT_SECONDS = 5
language_analyzers = {
'en': 'english',
'de': 'german',
#'fr': 'french',
#'el': 'greek',
#'es': 'spanish',
'default': 'standard',
}
properties = {
'resource_id': {'type': 'long'},
'site_id': {'type': 'long'},
'url': {'type': 'text'},
'base_url': {'type': 'text'},
'pub_date': {'type': 'date', 'format': 'yyyy-MM-dd||yyyy-MM||yyyy'},
'lang': {'type': 'keyword'},
'title': {'type': 'text'},
'authors': {'type': 'text'},
'summary': {'type': 'text'},
'keywords': {'type': 'text'},
'collections': {'type': 'keyword'},
'time_horizon': {'type': 'keyword'},
'orig_source': {'type': 'text'},
'topics': {'type': 'text'},
'annotations': {'type': 'text', 'index': False},
'sections': {
'type': 'nested',
'properties': {
'start_ids': {'type': 'integer'},
'end_ids': {'type': 'integer'},
'text': {'type': 'text', 'index_options': 'offsets'},
'embedding': {'type': 'dense_vector', 'dims': 512},
},
},
}
async def startup_engine(config):
"""
Open the search engine for access.
"""
engine = AsyncElasticsearch(
host=config['elasticsearch']['host'],
api_key=(
config['elasticsearch']['id'],
config['elasticsearch']['api_key'],
),
use_ssl=False,
timeout=20,
)
engine.index_base_name = config['elasticsearch']['index_base_name']
await create_indices(engine)
await open_indices(engine)
return engine
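# Sketch of the configuration section startup_engine expects (keys taken
# from the code above; the values are placeholders, not real credentials):
#
# config = {
#     'elasticsearch': {
#         'host': 'localhost',
#         'id': '<api-key-id>',
#         'api_key': '<api-key-secret>',
#         'index_base_name': 'atext',
#     },
# }
# engine = await startup_engine(config)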
async def create_indices(engine):
"""
Create indices for all configured languages.
"""
for lang, analyzer in language_analyzers.items():
index_name = engine.index_base_name + '_text_' + lang
if not await engine.indices.exists(index=index_name):
await engine.indices.create(index=index_name)
await engine.indices.close(index=index_name)
await engine.indices.put_settings(
index=index_name,
body={
'analysis': {'analyzer': {'default': {'type': analyzer}}},
'refresh_interval': '60s',
},
)
await engine.indices.put_mapping(
index=index_name,
body={'properties': properties},
)
async def open_indices(engine):
"""
Open indices for all configured languages.
"""
for lang in language_analyzers.keys():
index_name = engine.index_base_name + '_text_' + lang
await engine.indices.open(index=index_name)
async def shutdown_engine(engine):
"""
Close the connection to the search engine.
"""
# await close_indices(engine)
await engine.close()
async def close_indices(engine):
"""
Close indices. UNUSED.
"""
for lang in language_analyzers.keys():
index_name = engine.index_base_name + '_text_' + lang
await engine.indices.close(index=index_name)
async def index_resource(
engine,
tf,
site_path,
resource,
base_url,
url,
):
"""
Index a resource.
"""
lang = resource.lang
index_lang = lang if lang in language_analyzers.keys() else 'default'
index_name = engine.index_base_name + '_text_' + index_lang
pub_date = resource.search_fields.get('pub_date')
if pub_date:
pub_date = str(pub_date.date())
text = resource.search_fields.get('text')
annotations = resource.search_fields.get('annotations')
semantic_breaks = annotations['semantic_breaks']
sections = []
for section_ids, txt in concat_section_texts(text, semantic_breaks):
embedding = await tf.embed(txt)
sections.append(
{
'start_ids': section_ids[0],
'end_ids': section_ids[-1],
'text': txt,
'embedding': embedding,
}
)
doc = {
'resource_id': resource.id_,
'site_id': site_path.site_id,
'url': url,
'base_url': base_url,
'pub_date': pub_date,
'lang': resource.lang,
'title': resource.search_fields.get('title'),
'authors': resource.search_fields.get('authors'),
'summary': resource.search_fields.get('summary'),
'keywords': resource.search_fields.get('keywords'),
'collections': resource.search_fields.get('collections'),
'time_horizon': resource.search_fields.get('time_horizon'),
'orig_source': resource.search_fields.get('orig_source'),
'topics': resource.search_fields.get('topics'),
'annotations': pack_annotations(annotations),
'sections': sections,
}
timeout_seconds = max(MIN_INDEXING_TIMEOUT_SECONDS, int(len(text) / 1000))
await engine.index(
id=resource.id_,
index=index_name,
body=doc,
timeout=f'{timeout_seconds}s',
)
async def delete_resource(engine, lang, resource_id):
"""
Delete a resource.
"""
index_name = engine.index_base_name + '_text_' + (lang or 'default')
try:
await engine.delete(index_name, resource_id)
except NotFoundError:
msg = f'Cannot delete resource from index, not found: {resource_id}'
logger.warning(msg)
async def find_duplicate(engine, site_id, resource) -> Union[bool, None, int]:
"""
UNUSED.
Try to find a duplicate resource with matching site.
If the search backend query fails, return False.
If no matching resource was found, return None.
If a matching resource was found, return its id.
"""
# get sample texts
text = resource.search_fields['text']
if not text or len(text) < 100:
return None
# annotations = resource.search_fields['annotations']
# semantic_breaks = annotations['semantic_breaks']
# texts = []
# for _, txt in concat_section_texts(text, semantic_breaks):
# texts.append(txt)
# texts = extract_samples(texts)
# # search for sample texts
# text_count = len(texts)
# should_min = max(1, int(0.6 * text_count))
# should = []
# for text in texts:
# should.append({'match': {'sections.text': text}})
query = {
'bool': {
'must': {
'nested': {
'path': 'sections',
'query': {'match': {'sections.text': text}},
},
},
'filter': {
'term': {
'site_id': site_id,
},
},
}
}
fields = [
'url',
'sections.text',
'site_id',
]
response = await engine.search(
index=engine.index_base_name + '_text_*',
body={
'query': query,
'fields': fields,
'from': 0,
'size': 3,
'_source': False,
},
)
if response['timed_out']:
return False
for hit in response.get('hits', {}).get('hits'):
txt = ' '.join(hit['fields']['sections.text'])
similarity = SequenceMatcher(None, text, txt).ratio()
if similarity > 0.99:
return hit['_id']
return None

View file

@ -0,0 +1,9 @@
"""
Websites.
"""
from .feeds import fetch_feeds
from .operations import checkin_site, checkout_site, process_site, update_site
from .queue import process_site_queue
from .robots import RobotsInfo
from .seed import load_seeds

View file

@ -0,0 +1,68 @@
"""
Tool for analyzing a website.
Fetch the startpage and output information to console.
Do not change any persistent data.
"""
import asyncio
import logging
import sys
import aiohttp
from ..models import TextResource
from ..resource import ResourceFetcher, extract_sitemap_paths, get_sitemap_urls
from ..site.robots import RobotsInfo
from ..utils.durl import Durl
from .parse import parse_startpage
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
logger.addHandler(logging.StreamHandler())
async def run():
"""
Fetch the startpage of a website and show information about it.
The URL must be given as commandline argument.
"""
base_url = sys.argv[1]
async with aiohttp.ClientSession() as session:
if not (base_durl := await Durl(base_url)):
return
fetcher = ResourceFetcher(session)
resource = await fetcher.fetch(base_url)
logger.warning(repr(resource))
if (
isinstance(resource, TextResource)
and resource.content_type == 'html'
):
site = await parse_startpage(resource)
# site.crawl_enabled = await site_filter(site)
logger.warning(repr(site))
logger.warning('')
for durl, text in site.links_ext.items():
logger.warning(f' {durl} {text}')
logger.warning(f'{durl.url()} -------- {text}')
logger.warning('')
logger.warning(f'Redirects: {resource.init_fields["redirects"]}')
logger.warning('')
robots = await RobotsInfo(base_url)
urls = await get_sitemap_urls(
fetcher, base_url, sitemaps=robots.site_maps
)
paths, latest = extract_sitemap_paths(base_url, urls)
for path in paths:
logger.warning(path)
logger.warning(f'Feeds: {site.feeds}')
logger.warning(latest)
# sample_links = extract_samples(resource.init_fields['links_int'])
# logger.warning(f'************* {sample_links}')
else:
logger.warning('(No text resource or error.)')
if __name__ == '__main__':
asyncio.run(run())

View file

@ -0,0 +1,100 @@
"""
High-level feed-related stuff.
See resource.feed for low-level stuff not primarily related to sites.
"""
from datetime import datetime
from typing import Optional
from ..models import Feed
from ..resource import store_feed_entries, update_feed
async def store_new_feeds(conn, site_id, feeds: dict):
"""
Store new feeds in table site_feed.
"""
sql = "SELECT array_agg(url) FROM site_feed WHERE site_id=$1"
known_feeds = (await conn.fetchval(sql, site_id)) or []
for feed_url in feeds.keys():
if feed_url not in known_feeds:
feed = Feed(
site_id=site_id,
url=feed_url,
)
await feed.save(conn)
async def get_feeds(conn, site_id) -> list[Feed]:
"""
Return stored feeds for the given site.
"""
sql = "SELECT * FROM site_feed WHERE site_id=$1"
rows = (await conn.fetch(sql, site_id)) or []
return [(await Feed().load_from_row(row)) for row in rows]
async def fetch_feeds(fetcher, conn, site) -> Optional[datetime]:
"""
Fetch feeds, add new resources and return the latest content update time.
"""
feeds = await get_feeds(conn, site.id_)
latest = None
for feed in feeds:
feed_content = await update_feed(fetcher, feed, conn)
if feed_content:
await store_feed_entries(conn, site, feed_content)
if feed.t_content:
latest = max(latest or feed.t_content, feed.t_content)
return latest
if __name__ == '__main__':
# only use this on a dev instance!
import asyncio
import logging
import sys
import aiohttp
from ..config import Config
from ..db import PGPool
from ..resource.fetch import ResourceFetcher
from .operations import process_site, update_site
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
config = Config().get()
url = sys.argv[1]
async def run():
"""
Fetch and display a site.
"""
app = None # TODO
async with PGPool(config['postgresql']) as pool:
async with pool.acquire() as conn:
async with aiohttp.ClientSession() as session:
fetcher = ResourceFetcher(session)
site, _ = await update_site(app, fetcher, conn, url)
logger.warning(site)
await process_site(fetcher, conn, site)
latest = await fetch_feeds(fetcher, conn, site)
logger.warning(f'latest: {latest}')
# feed = Feed(url=url)
# feed_content = await update_feed(fetcher, feed, conn)
# if isinstance(feed_content, ResourceError):
# print(feed_content)
# else:
# print(feed)
# pprint(feed_content[0])
# print('---- 2nd try ----')
# feed_content = await update_feed(fetcher, feed, conn)
# if isinstance(feed_content, ResourceError):
# print(feed_content)
# else:
# print(feed)
# pprint(feed_content[0])
asyncio.run(run())

View file

@ -0,0 +1,267 @@
"""
Operations on sites.
"""
import logging
from datetime import datetime, timedelta
from typing import Optional
from asyncpg import Connection
from ..models import Crawl, Site, TextResource
from ..resource import (
add_site_paths,
extract_sitemap_paths,
get_sitemap_urls,
store_boilerplate_texts,
)
from ..utils.durl import Durl
from ..utils.similarity import get_simhash_index
from .feeds import fetch_feeds, store_new_feeds
from .parse import parse_startpage
from .robots import RobotsInfo
logger = logging.getLogger(__name__)
async def checkout_site(
app, conn: Connection
) -> tuple[Optional[Site], bool, bool]:
"""
Get a site to be crawled and mark it with crawl_active=true.
Also return whether the site shall be fully crawled; if not, this
means that just the resources from the feeds shall be crawled.
Also return whether more sites might be available.
"""
async with conn.transaction():
sql = (
"SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
" FROM site WHERE crawl_enabled AND crawl_active = false"
" AND (next_full_crawl < now() at time zone 'UTC'"
" OR next_feed_crawl < now() at time zone 'UTC')"
" LIMIT 1 FOR UPDATE SKIP LOCKED"
)
row = await conn.fetchrow(sql)
if row:
site_id = row['id']
is_full = row['is_full']
sql = "UPDATE site SET crawl_active = true WHERE id=$1"
await conn.execute(sql, site_id)
site = await Site().load(conn, site_id)
if site:
site.base_durl = await Durl(site.base_url)
if site.base_durl:
site.simhash_index = await get_simhash_index(conn, site_id)
return site, is_full, True
else:
# site not available; schedule next crawl
int_full = app.config['crawl']['full_crawl_interval']
int_feed = app.config['crawl']['feed_crawl_interval']
now = datetime.utcnow()
t_full = now + timedelta(seconds=int_full)
t_feed = now + timedelta(seconds=int_full + int_feed)
sql = (
"UPDATE site SET crawl_active=false,"
" next_full_crawl=$1, next_feed_crawl=$2"
" WHERE id=$3"
)
await conn.execute(sql, t_full, t_feed, site_id)
return None, False, True
return None, False, True
return None, False, False
async def update_site(
app, fetcher, conn: Connection, base_url, site: Optional[Site] = None
) -> tuple[Optional[Site], bool]:
"""
Try to fetch base_url and return a site and whether a new one was created.
This function is run for all sites (including blacklisted and irrelevant
ones). It determines whether the site shall be crawled.
If an error occurs, return (None, False), and if a site was given,
also set it to crawl_enabled=False and remove crawling schedules.
If base_url could be fetched, update the site, possibly creating
a new one.
If the site has crawl_enabled, and no full crawl is scheduled,
schedule one (by updating column `next_full_crawl`).
"""
# fetch startpage
logger.info(f'Updating site={site}, base_url={base_url}')
resource = await fetcher.fetch(base_url, site=site)
if (
not isinstance(resource, TextResource)
or resource.content_type != 'html'
):
if site:
site.meta_info['error'] = 'Invalid start page'
site.crawl_enabled = False
site.next_full_crawl = None
site.next_feed_crawl = None
await site.save(conn)
logger.info(f'Failed startpage {base_url}: {resource}')
return None, False
# parse startpage (extract site information) and save the site
site = await parse_startpage(resource, app=app, site=site)
site_id, created = await site.save(conn)
if created:
logger.debug(f'Created {site}')
# add black-/white-listing info
is_allowed = await is_site_allowed(conn, site.id_, base_url)
if is_allowed is not None and is_allowed != site.crawl_enabled:
site.crawl_enabled = is_allowed
await site.save(conn)
# schedule full crawl, if none is scheduled and the site shall be crawled
if site.crawl_enabled:
sql = (
"UPDATE site"
" SET next_full_crawl=now() at time zone 'UTC'"
" WHERE id=$1 AND next_full_crawl IS null"
)
await conn.execute(sql, site_id)
return site, created
async def is_site_allowed(
conn: Connection,
site_id: Optional[int],
base_url: str,
) -> Optional[bool]:
"""
Return True if the site is whitelisted, False if blacklisted, else None.
Also add missing site_ids to the annotations.
"""
sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
anns = await conn.fetch(sql, site_id, base_url)
for ann in anns:
if ann['ann_type'] == 'blacklist':
return False
if ann['ann_type'] == 'whitelist':
return True
# add missing site_ids
if site_id and any([ann['site_id'] is None for ann in anns]):
sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
await conn.execute(sql, site_id, base_url)
return None
async def process_site(fetcher, conn: Connection, site: Site):
"""
Process a site: fetch and store more information.
Store external and internal links; find boilerplate texts;
fetch sitemaps; fetch feeds; update date of last publication.
"""
if not site.id_: # only to satisfy typing
return
if site.links_ext:
await _store_cross_site_links(conn, site.id_, site.links_ext)
if site.links_int:
paths = []
for durl, (rel, _) in site.links_int.items():
canon = (rel and rel.lower() == 'canonical') or None
paths.append((durl.pwa(), canon))
await add_site_paths(conn, site.id_, paths)
await store_boilerplate_texts(fetcher, conn, site)
# get sitemaps and add their resources
robots = await RobotsInfo(site.base_url) # type: ignore
urls = await get_sitemap_urls(
fetcher, site.base_url, sitemaps=robots.site_maps
)
paths_, latest = extract_sitemap_paths(site.base_url, urls)
await add_site_paths(conn, site.id_, paths_)
# store feeds and their resources
await store_new_feeds(conn, site.id_, site.feeds)
latest_ = await fetch_feeds(fetcher, conn, site)
if latest_:
latest = max(latest or latest_, latest_)
# update last_pub
if latest:
site.last_pub = latest
await site.save(conn)
async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
"""
Unlock the site and schedule next crawl.
*crawl* is the crawl that has just finished (regularly or stopped).
If the crawl was stopped (t_end is None), just unlock the site.
Otherwise schedule a crawl of the same type. After a full crawl
also a feed crawl is scheduled, if there was none scheduled.
"""
if crawl.t_end is None:
sql = "UPDATE site SET crawl_active=false WHERE id=$1"
await conn.execute(sql, site.id_)
elif crawl.is_full:
full_interval = app.config['crawl']['full_crawl_interval']
feed_interval = app.config['crawl']['feed_crawl_interval']
next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
sql = (
"UPDATE site SET crawl_active=false, next_full_crawl=$1,"
" next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
)
await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
else:
feed_interval = app.config['crawl']['feed_crawl_interval']
next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
sql = (
"UPDATE site SET crawl_active=false, next_feed_crawl=$1"
" WHERE id=$2"
)
await conn.execute(sql, next_feed_crawl, site.id_)
async def _store_cross_site_links(
conn: Connection,
site_id: int,
links: dict[Durl, tuple[list[str], str]],
) -> None:
"""
Put outgoing links into site_link/site_queue for existing/unknown sites.
Separate outgoing links from *site_id* into two classes:
(a) existing sites (rows in table site) and (b) unknown links.
Add links from class (a) to table site_link.
Add links from class (b) to table site_queue.
"""
# add outgoing cross-site links for existing sites to table site_link
urls = [url.site() for url in links.keys()]
values = []
sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
if rows := await conn.fetch(sql, urls):
for row in rows:
if (durl := await Durl(row['url'])) in links.keys():
_, link_text = links.pop(durl)
if site_id != row['id']:
values.append((site_id, row['id'], link_text))
sql = (
"INSERT INTO site_link (src, dst, link_text)"
" VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
)
await conn.executemany(sql, values)
# add outgoing cross-site links for unknown sites to table site_queue
sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
values = [
(site_id, durl.site()[:200], link_text[:100])
for durl, (_, link_text) in links.items()
]
await conn.executemany(sql, values)

View file

@ -0,0 +1,255 @@
"""
Parsing of a site's startpage.
"""
import re
from datetime import datetime
from typing import Any, Optional
from ..models import Site, TextResource
from ..resource import feed_types
from ..utils.durl import Durl, get_ips
from ..utils.html import clean_html
from ..utils.lang import clean_lang
from ..utils.link import (
extract_domain,
in_blacklist,
link_rels,
meta_names,
meta_props,
)
re_meta_keyword_sep = re.compile('[,;\r\n]')
def cut_str(s: Optional[str], l: int) -> Optional[str]:
"""
Cut a string *s* to a maximal length *l* from the left.
"""
return s[:l] if s else None
async def parse_startpage(
startpage: TextResource, app=None, site=None
) -> Site:
"""
Parse a site's startpage and return a Site instance.
If a site instance is given, update it.
"""
durl = startpage.init_fields['durl']
soup = startpage.init_fields['head']
meta = collect_meta_tags(soup)
meta_links = await collect_meta_links(soup, durl)
links_ext = await collect_external_links(startpage, meta_links)
links_int = startpage.init_fields['links_int']
langs = extract_languages(startpage, meta, meta_links)
title, description, keywords = extract_meta_texts(startpage, meta)
# feeds
feeds = meta_links['feeds']
if 'wordpress' in meta.get('generator', '').lower():
url = durl.site() + 'feed/'
feeds[url] = 'application/rss+xml'
# TODO later: maybe also probe other possible feed paths 'rss', 'rss/'
# network params (canonical_url, base_urls, domains)
ips = await get_ips(durl.hostname)
redirects = []
for redirect in startpage.init_fields['redirects']:
redir_url = await Durl(redirect)
if redir_url:
redirects.append(redir_url.site())
base_urls = redirects + [durl.url()]
domains = [extract_domain(durl.hostname)]
if site: # update an existing Site
site.canonical_url = meta_links['canonical_url'] or site.canonical_url
site.base_urls = base_urls
site.domains = domains
site.ips = ips
site.last_update = datetime.utcnow()
site.last_pub = startpage.last_change
site.langs = langs
site.alt_langs = meta_links['alt_langs']
site.title = title
site.description = description
site.keywords = keywords
site.linkbacks.update(meta_links['linkbacks'])
site.meta_info = meta
site.__post_init__(
base_durl=durl,
feeds=feeds,
links_ext=links_ext,
links_int=links_int,
startpage_text=startpage.search_fields['text'],
)
else: # create new Site instance
site = Site(
# post_init fields
base_durl=durl,
feeds=feeds,
links_ext=links_ext,
links_int=links_int,
startpage_text=startpage.search_fields['text'],
# dataclass fields
canonical_url=meta_links['canonical_url'],
base_urls=base_urls,
domains=domains,
ips=ips,
last_update=datetime.utcnow(),
last_pub=startpage.last_change,
langs=list(langs),
alt_langs=meta_links['alt_langs'],
title=title,
description=description,
keywords=keywords,
linkbacks=meta_links['linkbacks'],
meta_info=meta,
)
if site.ips is None and site.url:
site.ips = await get_ips(site.url.hostname)
if app and site.startpage_text:
site_filter = app.plugins['filter_site'].site_filter
site.crawl_enabled = await site_filter(site)
return site
def collect_meta_tags(soup):
"""
Collect selected meta tags (meta_names and meta_props) with their values.
"""
meta = {}
for tag in soup.find_all('meta'):
if (name := tag.get('name')) and name in meta_names:
meta[name] = tag.get('content')
if (property := tag.get('property')) in meta_props:
if content := tag.get('content'):
meta[property] = content
if tag.get('http-equiv') == 'content-language': # old html
if content := tag.get('content'):
meta['http_equiv_lang'] = content
return meta
async def collect_meta_links(soup, base_durl) -> dict[str, Any]:
"""
Collect link tags with site scope (feeds, linkbacks, canonical, ...).
"""
linkbacks = {}
feeds = {}
alt_langs = {}
canonical_url = None
for tag in soup.find_all('link'):
if not (rels := set(tag.get('rel', []))) or not rels & link_rels:
continue
if not (url := tag.get('href')):
continue
if not (link_durl := await Durl(url, base=base_durl)):
continue
if in_blacklist(link_durl.hostname):
continue
link_url = link_durl.url()
link_type = tag.get('type')
if link_type in feed_types:
feeds[link_url] = link_type
elif 'canonical' in rels:
canonical_url = link_url
elif 'alternate' in rels and (hreflang := tag.get('hreflang')):
if lang := clean_lang(hreflang):
alt_langs[lang] = link_durl.url()
elif 'webmention' in rels:
linkbacks[link_url] = 'webmention'
elif 'pingback' in rels:
linkbacks[link_url] = 'pingback'
if canonical_url:
if canonical_durl := await Durl(canonical_url):
canonical_url = canonical_durl.site()
else:
canonical_url = None
return {
'feeds': feeds,
'linkbacks': linkbacks,
'alt_langs': alt_langs,
'canonical_url': canonical_url,
}
async def collect_external_links(startpage, meta_links) -> dict[str, str]:
"""
Return external links (mapping from URL to link text) from startpage.
Also add links to alternate language variants of the site.
"""
external_links = startpage.init_fields['links_ext'].copy()
netloc = startpage.init_fields['durl'].netloc
for lang, lang_url in meta_links['alt_langs'].items():
if netloc not in lang_url:
durl = await Durl(lang_url)
if durl:
external_links[durl] = f'Alternate language: {lang}'
return external_links
def extract_meta_texts(page, meta) -> tuple[str, Optional[str], list[str]]:
"""
Extract and return title, description, keywords from a page and meta tags.
"""
title = meta.get('og:site_name')
if not title:
title = page.search_fields['title'] or ''
if meta_title := meta.pop('title', None):
if meta_title.lower() not in title.lower():
title += ('; ' if title else '') + meta_title
title = cut_str(clean_html(title), 200)
description = cut_str(clean_html(meta.pop('description', None)), 2000)
if meta_keywords := meta.pop('keywords', None):
kws = re_meta_keyword_sep.split(meta_keywords)
keywords = [kw.strip()[:50] for kw in kws if kw.strip()]
if len(keywords) < 2:
keywords = [
kw.strip()[:50]
for kw in meta_keywords.split(' ')
if kw.strip()
]
else:
keywords = []
return title, description, keywords
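# Illustrative sketch of the keyword handling above (example strings are
# made up): keywords are split on ','/';'/newlines; if that yields fewer
# than two entries, the string is split on spaces instead.
#
# 'crawler, search; indexing'  -> ['crawler', 'search', 'indexing']
# 'crawler search indexing'    -> ['crawler', 'search', 'indexing']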
def extract_languages(page, meta, meta_links) -> set[str]:
"""
Extract languages from a page's html tag, meta tags and HTTP headers.
Also add the language detected in the text content of the page.
Return a set of ISO 639-1 language codes.
See also https://www.w3.org/International/questions/qa-http-and-lang and
https://www.w3.org/International/questions/qa-html-language-declarations
"""
languages = set()
if lang := clean_lang(page.lang):
languages.add(lang)
if lang := clean_lang(meta.get('http_equiv_lang')):
languages.add(lang)
if lang := clean_lang(meta.get('dc.language')):
languages.add(lang)
if lang := clean_lang(meta.get('og:locale')):
languages.add(lang)
for lang, lang_url in meta_links['alt_langs'].items():
if page.init_fields['durl'].netloc in lang_url:
if lng := clean_lang(lang):
languages.add(lng)
lngs = (
page.init_fields['headers']
.get('Content-Language', '')
.lower()
.replace(' ', '')
.split(',')
)
for lng in lngs:
if lang := clean_lang(lng):
languages.add(lang)
if page.lang:  # avoid adding None
languages.add(page.lang)
return languages

View file

@ -0,0 +1,127 @@
"""
Queue of sites.
When processing a resource, its external links are put into database table
`site_queue`.
The items in `site_queue` are processed in :func:`process_site_queue`.
This is done base URL by base URL (see :func:`iter_site_queue`).
While doing this, cross-site links are put into table `site_link`.
"""
import logging
from typing import AsyncIterator, Optional
import aiohttp
from asyncpg import Connection
from ..resource import ResourceFetcher
from .operations import update_site
logger = logging.getLogger(__name__)
async def process_site_queue(app, pool):
"""
Loop over queued sites creating new sites and adding cross-site links.
"""
site_delay = app.config['crawl']['site_delay']
resource_delay = app.config['crawl']['resource_delay']
async with pool.acquire() as conn:
async with aiohttp.ClientSession() as session:
fetcher = ResourceFetcher(session)
while app.running:
async for base_url, links_from in iter_site_queue(app, conn):
# get or create site
msg = f'Site queue: updating {base_url}'
logger.debug(msg)
site, created = await update_site(
app, fetcher, conn, base_url
)
if site:
await store_incoming_site_site_links(
conn, site.id_, links_from
)
# delete handled queue items
sql = "DELETE FROM site_queue WHERE url=$1"
await conn.execute(sql, base_url)
await app.sleep(resource_delay)
logger.debug(
f'Queued sites exhausted, sleeping'
f' for {site_delay} seconds'
)
await app.sleep(site_delay)
async def iter_site_queue(
app, conn: Connection
) -> AsyncIterator[tuple[str, dict[int, str]]]:
"""
Yield URLs with aggregated link information from site_queue.
Yield a URL and a dict mapping ids of linking sites to link texts.
"""
site_revisit_interval = app.config['crawl']['site_revisit_interval']
while app.running:
sql = (
"SELECT url, array_agg(src) srcs,"
" array_agg(link_text) link_texts"
" FROM site_queue GROUP BY url LIMIT 1"
)
row = await conn.fetchrow(sql)
if row:
base_url = row['url']
links_from = {}
srcs = row['srcs']
link_texts = row['link_texts']
for i in range(len(srcs)):
if src := srcs[i]:
links_from[src] = link_texts[i]
if site_id := await site_recently_updated(
conn, base_url, site_revisit_interval
):
# just store incoming links and remove the site from the queue
await store_incoming_site_site_links(conn, site_id, links_from)
sql = "DELETE FROM site_queue WHERE url=$1"
await conn.execute(sql, base_url)
else:
yield base_url, links_from
else:
break
async def site_recently_updated(
conn: Connection,
base_url: str,
site_revisit_interval: float,
) -> Optional[int]:
"""
Return the id of the site with given base_url if it was updated recently.
"""
sql = (
f"SELECT id FROM site WHERE $1=any(base_urls)"
f" AND last_update + interval '{site_revisit_interval} seconds'"
f" > now() at time zone 'utc' LIMIT 1"
)
site_id = await conn.fetchval(sql, base_url)
return site_id
async def store_incoming_site_site_links(
conn: Connection, site_id: int, links_from: dict
):
"""
Store incoming site-site links (irrespective of crawl_enabled).
*site_id* is the id of the site to which the links in *links_from* point.
"""
sql = (
"INSERT INTO site_link"
" (src, dst, link_text) VALUES ($1, $2, $3)"
" ON CONFLICT (src, dst) DO NOTHING"
)
values = [
(from_id, site_id, link_text)
for from_id, link_text in links_from.items()
if from_id != site_id
]
await conn.executemany(sql, values)

View file

@ -0,0 +1,98 @@
"""
Fetch and evaluate a website's robots.txt.
"""
import logging
from typing import Optional, Union
from urllib.robotparser import RobotFileParser
import aiohttp
logger = logging.getLogger(__name__)
class RobotsInfo(RobotFileParser):
"""
Obtain information from a site's robots.txt.
After instantiation you must await :meth:`startup`.
"""
def __init__(
self,
site_url: str,
user_agent: str = '*',
session: aiohttp.ClientSession = None,
):
super().__init__()
self.__user_agent = user_agent
self.__site_url = site_url.rstrip('/')
self.__robots_url = self.__site_url + '/robots.txt'
self.__timeout = aiohttp.ClientTimeout(sock_connect=2, sock_read=3)
self.__session = session
def __await__(self):
return self.__ainit__().__await__()
async def __ainit__(self):
if self.__session:
content = await self.__get_robots_txt(self.__session)
else:
async with aiohttp.ClientSession() as session:
content = await self.__get_robots_txt(session)
self.parse(content.splitlines())
self.__delay = self.crawl_delay(self.__user_agent)
request_rate = self.request_rate(self.__user_agent)
if request_rate:
self.__delay = request_rate.seconds / request_rate.requests
self.__site_maps = super().site_maps() or []
return self
async def __get_robots_txt(self, session: aiohttp.ClientSession) -> str:
"""
Fetch and return the robots.txt over http.
"""
try:
async with session.get(
self.__robots_url, timeout=self.__timeout
) as resp:
if resp.status == 200:
try:
content = await resp.text()
except:
body = await resp.read()
content = body.decode(
resp.charset or 'utf-8', errors='ignore'
)
else:
content = ''
except aiohttp.ClientError:
content = ''
return content
@property
def user_agent(self) -> str:
"""
The user agent being used.
"""
return self.__user_agent
@property
def delay(self) -> Optional[Union[int, float]]:
"""
The delay to be used between requests.
"""
return self.__delay
@property
def site_maps(self) -> list[str]:
"""
The list of sitemaps of the site.
"""
return self.__site_maps
def can_fetch_url(self, url: str) -> bool:
"""
Return whether fetching of the given *url* is allowed.
"""
return super().can_fetch(self.__user_agent, url)
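# Usage sketch (an assumption, not part of the module): RobotsInfo is
# awaitable via __await__ above, so inside a coroutine it is used like this.
#
# async def demo():
#     robots = await RobotsInfo('https://example.org')
#     print(robots.delay, robots.site_maps)
#     print(robots.can_fetch_url('https://example.org/some/path'))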

View file

@ -0,0 +1,72 @@
"""
Seeding of new installations with URLs from blacklists and whitelists.
"""
from pathlib import Path
import asyncpg
from ..utils.durl import Durl
async def load_seeds(config: dict, pool: asyncpg.Pool) -> None:
"""
Add seed file contents (site blacklist and whitelist).
If there are sites already, do nothing.
"""
async with pool.acquire() as conn:
site_count = await conn.fetchval("SELECT count(*) FROM site")
if site_count:
return
# add blacklist entries
values = []
blacklist = _load_list(config['config_dir'], 'black')
for base_url in blacklist:
durl = await Durl(base_url)
if durl:
url = durl.site()
values.append((url, {'source': 'seed file'}))
sql = (
"INSERT INTO site_annotation (base_url, ann_type, ann_content)"
" VALUES ($1, 'blacklist', $2)"
)
await conn.executemany(sql, values)
# add whitelist entries
values1 = []
values2 = []
whitelist = _load_list(config['config_dir'], 'white')
for base_url in whitelist:
durl = await Durl(base_url)
if durl:
url = durl.site()
if url not in blacklist:
values1.append((url, {'source': 'seed file'}))
values2.append((url,))
sql = (
"INSERT INTO site_annotation (base_url, ann_type, ann_content)"
" VALUES ($1, 'whitelist', $2)"
)
await conn.executemany(sql, values1)
sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)"
await conn.executemany(sql, values2)
def _load_list(config_dir, black_white):
"""
Load the seed black or white list.
"""
path = Path(config_dir) / 'initial_data' / 'seed_urls.list'
with open(path, 'r') as list_file:
urls = []
for line in list_file.read().strip().splitlines():
line_ = line.strip()
if line_.startswith('#'):
continue
if black_white == 'black' and line_.startswith('-'):
urls.append(line_[1:].strip())
if black_white == 'white' and line_.startswith('+'):
urls.append(line_[1:].strip())
return urls
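# Sketch of the seed file format implied by the parsing above
# (config_dir/initial_data/seed_urls.list); the entries are examples only:
#
#   # comment lines start with '#'
#   +https://good.example.org/      <- whitelist entry
#   -https://spammy.example.com/    <- blacklist entry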

View file

@ -0,0 +1,69 @@
"""
Query the tensorflow_model_server's REST API.
"""
import logging
from typing import Optional, Union
import aiohttp
logger = logging.getLogger(__name__)
class TensorFlow:
"""
Fetch an embedding vector from the tensorflow model server.
"""
def __init__(
self,
app,
session: aiohttp.ClientSession,
timeout_sock_connect: Union[int, float] = 0.5,
timeout_sock_read: Union[int, float] = 10,
):
self.config = app.config['tensorflow']
self.session = session
self.timeout = aiohttp.ClientTimeout(
sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
)
async def embed(
self, text: Union[str, list[str]]
) -> Optional[Union[list[float], list[list[float]]]]:
"""
Query the tensorflow_model_server's REST API for a prediction.
Take a string or a list of strings and return an embedding vector
or a list of embedding vectors.
If the request fails or times out, return None.
"""
text_ = text if isinstance(text, list) else [text]
data = {'signature_name': 'serving_default', 'instances': text_}
try:
async with self.session.post(
self.config['model_server_endpoint'],
json=data,
timeout=self.timeout,
) as resp:
try:
res = await resp.json()
if isinstance(text, list):
return res.get('predictions')
else:
return res.get('predictions')[0]
except:
msg = 'Got invalid response from tensorflow'
logger.error(msg)
return None
except Exception as err:
msg = 'Could not get embedding from tensorflow for '
if isinstance(text, str):
msg += f'string of length {len(text)}'
else:
msg += 'list of strings with lengths '
msg += ','.join([str(len(s)) for s in text])
msg += f', reason: {err}'
logger.error(msg)
return None
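# Minimal usage sketch (assumes an *app* whose config provides
# config['tensorflow']['model_server_endpoint']); not part of the module.
#
# async def demo(app):
#     async with aiohttp.ClientSession() as session:
#         tf = TensorFlow(app, session)
#         vector = await tf.embed('One paragraph-sized text chunk.')
#         vectors = await tf.embed(['first chunk', 'second chunk'])
#         return vector, vectors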

View file

View file

@ -0,0 +1,481 @@
"""
Convert html to plain text with annotations over character ranges.
"""
import re
from collections import defaultdict
from html.parser import HTMLParser
from .json import json_dumps, json_loads
from .link import nofollow_link_rels
from .tag import keep_tags, self_closing_tags
MAX_HREF_LENGTH = 200
"""
Maximum length of an href. Other links are discarded.
"""
text_blacklist = [
'previous',
'next',
'back', # common pagination navigation
'↩︎', # amusewiki footnote separator (after conversion from muse to html)
]
"""
Texts to ignore.
"""
class AnnotatingParser(HTMLParser):
"""
Parse tagged text resulting in pure text and annotations.
The text is available in self.text and the annotations
in self.annotations, which is a dict with these keys:
* tags: contains a mapping of offset ranges (i, f) to
the tags opening at i and closing at f
* semantic_breaks: a mapping of offset positions where
a new section begins to the nesting level of that
section; a section begins wherever an (opening or closing)
separating tag is placed in the raw html; for the
separating flag of tags see tag.py
* links: a mapping of hrefs to link texts obtained from
anchor (a) tags; we skip hyperref with nofollow rels
* section_ids: map an offset position to the first
id attribute (of any tag) at the beginning of a
semantic section; this can later be used in a URL
fragment for linking directly into this section
Internally, we put opening tags on self.stack and pop them
when the first matching closing tag is encountered. We assume
balanced tags (tidy html).
NB: all tags with semantic breaks have sep=True, i.e.,
they will have spaces around them so that the semantic breaks
always sit on a space; the semantic break position p is the end
of the last section and the next sections begins at p + 1.
The text always begins with a ' ' (added if not in the original),
which is assigned a semantic break with default level 80
(if there is no semantic break tag at the beginning).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.text = ' ' # concatenated text data (without tags)
self.pos = 1 # equal to len(self.text)
self.stack = []
self.tags = defaultdict(dict)
self.semantic_breaks = {0: 80}
self.tag_id = None
self.section_ids = defaultdict(list)
self.links = {}
self.add_space = False
def close(self):
"""
Finish by collecting results in dict `self.annotations`.
"""
super().close()
self.annotations = {}
self.annotations['links'] = self.links
self.annotations['semantic_breaks'] = {
pos: lvl for pos, lvl in sorted(self.semantic_breaks.items())
}
self.annotations['tags'] = self.tags
self.annotations['section_ids'] = self.section_ids
def handle_starttag(self, tag, attrs):
"""
Called for each opening tag.
"""
sep, lvl, sem = keep_tags[tag]
attrs = dict(attrs)
if sep:
self.add_space = True
if tag == 'section' and 'endnotes' in attrs.get('role', ''):
lvl = 25
# ARIA roles
if role := attrs.get('role'):
if role == 'article':
lvl = 15
elif role == 'heading':
if aria_level := attrs.get('aria-level'):
if aria_level in ('1', '2', '3', '4', '5', '6'):
sep, lvl, sem = keep_tags[f'h{aria_level}']
elif role == 'region':
lvl = 24
i = self.pos
if tag in self_closing_tags:
# self-closing tags will not be added to the result tags,
# they only appear in semantic_breaks
# the two self-closing tags br and hr both have lvl and sep
if i == 1: # replace the default semantic break at pos 0
i = 0
self.add_semantic_break(i, lvl)
i += 1
if tag_id := attrs.get('id'):
self.tag_id = i, tag_id
self.add_tag_id(i) # br or hr may have an id, too
self.add_space = True
else:
self.stack.append((i, tag, sep, lvl, sem, attrs))
# forget outdated tag id at new semantic break
if lvl:
self.forget_tag_id()
# memorize tag id
if not self.tag_id and (tag_id := attrs.get('id')):
self.tag_id = self.pos, tag_id
def handle_endtag(self, tag):
"""
Called for each closing tag.
"""
if not self.stack or (self.stack and self.stack[-1][1] != tag):
return # nothing to do for an already closed self-closing tag
i, tag_, sep, lvl, sem, attrs = self.stack.pop()
f = self.pos
# omit tag without content
if i == f:
return
# for a closing div tag revise lvl to minimum level of contained
# semantic breaks (if any)
if tag == 'div':
min_lvl = 101
for pos_, lvl_ in reversed(self.semantic_breaks.items()):
if pos_ <= i:
break
min_lvl = min(min_lvl, lvl_)
if min_lvl < 101:
lvl = min_lvl
# add semantic break and an optional section_id
if lvl:
if i == 1: # replace the default semantic break at pos 0
i = 0
if tag in ('ul', 'ol', 'li'):
seen_tags = [x[1] for x in self.stack]
if 'p' not in seen_tags:
lvl = 52 + seen_tags.count('tag')
if tag == 'li':
lvl += 1
self.add_semantic_break(i, lvl)
self.add_tag_id(i)
# do not include surrounding spaces in tag span
if self.text[i] == ' ':
i += 1
# add tag
self.tags[(i, f)][tag] = sem
# add space (when handling next data)
if sep:
self.add_space = True
# collect links
if tag == 'a':
self.extract_link(i, attrs)
def handle_data(self, text):
"""
Called for each non-tag content between tags.
"""
# handle empty or blacklisted text
if text == '':
return
if text == ' ':
self.add_space = True
return
if text.strip().lower() in text_blacklist:
if ' ' in text:
self.add_space = True
return
# add a space (at self.pos) if the text begins with one
# or if we shall add one
startswith_space = text.startswith(' ')
text = text.lstrip()
if startswith_space or self.add_space:
if self.text[-1] != ' ':
self.text += ' '
self.pos += 1
self.add_space = False
# strip a space at the end of text and handle it in end tag
if text.endswith(' '):
text = text[:-1]
self.add_space = True
# add text to self.text
self.text += text
self.pos += len(text)
def add_semantic_break(self, pos, lvl):
"""
Add a semantic break of level *lvl* at position *pos*.
"""
if pos in self.semantic_breaks:
self.semantic_breaks[pos] = min(self.semantic_breaks[pos], lvl)
else:
self.semantic_breaks[pos] = lvl
def forget_tag_id(self):
"""
Reset a tag id if it is too far behind in the text stream.
"""
if self.tag_id:
pos_, tag_id = self.tag_id
if pos_ + 200 < self.pos:
self.tag_id = None
def add_tag_id(self, pos):
"""
Add and clear an id if the just closing section has none yet.
*pos* is the start position of the current section, and the
position where the id will be added.
Add an id only if we are not too far in the section's text already.
"""
if self.tag_id:
pos_, tag_id = self.tag_id
if pos_ < pos + 100 and pos not in self.section_ids:
self.section_ids[pos].append(tag_id.lower())
self.tag_id = None
def extract_link(self, i, attrs):
"""
Add a link covering character range (i, self.pos).
From html *attrs* extract href and rel.
"""
if (href := attrs.get('href')) and not attrs.get('rel') == 'nofollow':
if href.startswith('#'):
return
if len(href) > MAX_HREF_LENGTH:
return
if rel := attrs.get('rel'):
if set(rel) & nofollow_link_rels:
return
self.links[href] = i, self.pos, rel
def annotate(html):
"""
Split html text into plain text with annotations (from AnnotatingParser).
"""
parser = AnnotatingParser()
parser.reset()
parser.feed(html)
parser.close()
return parser.text, parser.annotations
re_footnote = re.compile(r'^\s*\[\d+\]\s+')
def headline_probability(text, tags, lvl) -> float:
"""
Estimate the probability that the text with tags is a headline.
The context is not considered: The question is not whether the
text is a headline for the following text.
"""
text = text.strip()
res = 0.0
if not text:
return res
if lvl < 60:
return 1.0
# if 'h1' in tags or 'h2' in tags or 'h3' in tags or\
# 'h4' in tags or 'h5' in tags or 'h6' in tags or 'center' in tags:
# return 1.0
if len(text) < 80:
res = 0.7
else:
res = 0.7 - 0.7 * (len(text) - 80) / 200
if 'p' in tags:
res -= 0.4
if 'em' in tags:
res += 0.3
if 'a' in tags:
res -= 0.1
if text[-1] in '.:':
res -= 0.3
res -= 0.1 * text.count(', ')
if re_footnote.match(text):
res -= 0.4
return max(res, 0.0)
def get_tag_counts(tag_names, i, f, tags, text) -> tuple[int, float, float]:
"""
Return the info on the share of characters covered with one of the *tags*.
Only consider the characters between i and f of string *text*.
Return the number of tags that have an overlap in the specified region,
the tag density in the region (the fraction of characters covered by these tags),
and the average number of covered chars per tag.
NB: If more than one tag name is given, then the fractional share
may exceed 1.
"""
if i == f:
return 0, 0.0, 0.0
tag_count = 0
covered_chars = 0
for (s_i, s_f), anns in tags.items():
if overlap := range_overlap(i, f - 1, s_i, s_f - 1):
for ann in anns:
if ann in tag_names:
tag_count += 1
covered_chars += overlap[1] - overlap[0]
all_chars = f - i
tag_density = covered_chars * 1.0 / all_chars
avg_text_len = covered_chars * 1.0 / tag_count if tag_count else 0
return tag_count, tag_density, avg_text_len
def range_overlap(i1, f1, i2, f2):
"""
Return the overlap of both ranges (None if there is none).
"""
return None if f1 <= i2 or f2 <= i1 else (max(i1, i2), min(f1, f2))
def annotations_remove_section(annotations, i, f):
"""
Remove section (i, f) from annotations and return result.
"""
new_annotations = {}
d = f - i
if not d:
return annotations
# relocate tags
new_tags = {}
for (t_i, t_f), anns in annotations['tags'].items():
n_i, n_f = cut_range(i, f, d, t_i, t_f)
if n_i is not None:
new_tags[(n_i, n_f)] = anns
new_annotations['tags'] = new_tags
# relocate links
new_links = {}
for href, (l_i, l_f, rel) in annotations['links'].items():
n_i, n_f = cut_range(i, f, d, l_i, l_f)
if n_i is not None:
new_links[href] = n_i, n_f, rel
# relocate semantic breaks and section_ids
semantic_breaks = annotations['semantic_breaks']
section_ids = annotations['section_ids']
new_semantic_breaks = {}
new_section_ids = {}
for pos in sorted(semantic_breaks.keys()):
level = semantic_breaks[pos]
if i <= pos and pos < f:
continue # discard
elif f <= pos:
new_semantic_breaks[pos - d] = level
if pos in section_ids:
new_section_ids[pos - d] = section_ids[pos]
else:
new_semantic_breaks[pos] = level
if pos in section_ids:
new_section_ids[pos] = section_ids[pos]
# collect and return results
new_annotations['semantic_breaks'] = new_semantic_breaks
new_annotations['section_ids'] = new_section_ids
new_annotations['links'] = new_links
return new_annotations
def cut_range(i, f, d, t_i, t_f):
"""
Return the new coordinates of a text range (t_i,t_f) after cutting (i,f).
If (t_i,t_f) is fully within (i,f), return None, None.
"""
if t_f < i:
return t_i, t_f
elif t_i < i <= t_f <= f:
return t_i, i
elif t_i < i and f <= t_f:
return t_i, t_f - d
elif i <= t_i and t_f <= f:
return None, None
elif i <= t_i <= f < t_f:
return i, t_f - d
else: # f < t_i
return t_i - d, t_f - d
def clean_annotations(annotations: dict) -> None:
"""
Remove void stuff from annotations.
"""
cleaned_tags = {}
for (i, f), anns in annotations['tags'].items():
if f > i and anns:
cleaned_tags[(i, f)] = anns
annotations['tags'] = cleaned_tags
def pack_annotations(annotations):
"""
Pack annotations to a special JSON string, reducing their volume a little.
"""
return json_dumps(
{
'tags': _pack_tags(annotations['tags']),
'semantic_breaks': ','.join(
[
f'{pos}:{level}'
for pos, level in annotations['semantic_breaks'].items()
]
),
'section_ids': annotations['section_ids'],
'links': annotations['links'],
}
)
def _pack_tags(tags: dict) -> str:
"""
Utility function for packing tag information into a string.
"""
res = ''
for (i, f), anns in tags.items():
if anns:
anns_ = ','.join([f'{tag}={sem}' for tag, sem in anns.items()])
res += f'{i}-{f}:{anns_}\n'
return res
def unpack_annotations(json_text: str) -> dict:
"""
Unpack tag information from a string.
"""
annotations = json_loads(json_text)
tags = {}
for line in annotations['tags'].split('\n'):
if line:
range_, anns_ = line.split(':')
i, f = range_.split('-')
i = int(i)
f = int(f)
anns = {}
if anns_:
for ann_ in anns_.split(','):
tag_, sem_ = ann_.split('=')
anns[tag_] = sem_
tags[(i, f)] = anns
semantic_breaks = {}
for sb_ in annotations['semantic_breaks'].split(','):
pos_, lvl_ = sb_.split(':')
semantic_breaks[int(pos_)] = int(lvl_)
return {
'tags': tags,
'semantic_breaks': semantic_breaks,
'section_ids': annotations['section_ids'],
'links': annotations['links'],
}
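
A small round-trip sketch (not part of the commit), using the import path from the test suite; the html snippet is illustrative.

from atextcrawler.utils.annotation import (
    annotate,
    pack_annotations,
    unpack_annotations,
)

html = '<h1 id="top">Title</h1><p>Hello <em>world</em>.</p>'
text, annotations = annotate(html)
print(text)                                  # ' Title Hello world.'
print(annotations['semantic_breaks'])        # {0: 30, 6: 60}
print(dict(annotations['section_ids']))      # {0: ['top']}
packed = pack_annotations(annotations)
restored = unpack_annotations(packed)
print(restored['semantic_breaks'] == annotations['semantic_breaks'])  # True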

View file

@@ -0,0 +1,90 @@
"""
Find date expressions in a string.
"""
import re
from datetime import datetime
from typing import Optional
p_day = r'(0?[1-9]|[12][0-9]|3[01])'
p_month = r'(0?[1-9]|1[0-2])'
p_year = r'(20\d\d|19\d\d)'
sep = r'\D{1,2}'
p_t = r'(\D{0,4}([01][0-9]|2[0-3]):([0-5][0-9]))?'
format_re = {
'iso': (
re.compile(f'(^|\\D){p_year}{sep}{p_month}{sep}{p_day}(\\D{p_t}|$)'),
(1, 2, 3, 6, 7),
),
'dmy': (
re.compile(f'(^|\\D){p_day}{sep}{p_month}{sep}{p_year}(\\D{p_t}|$)'),
(3, 2, 1, 6, 7),
),
'mdy': (
re.compile(f'(^|\\D){p_month}{sep}{p_day}{sep}{p_year}(\\D{p_t}|$)'),
(3, 1, 2, 6, 7),
),
}
lang_format = {
'de': ('iso', 'dmy'),
'en': ('iso', 'mdy'),
None: ('iso', 'dmy', 'mdy'),
}
def extract_latest_date(text: str, lang: str = None) -> Optional[datetime]:
"""
Extract the latest date compatible with the *lang* from *text*.
Only consider dates in the past.
"""
dates = extract_dates(text, lang=lang)
return max(dates) if dates else None
def extract_dates(text: str, lang: str = None) -> list[datetime]:
"""
Extract dates from a string, optionally limiting formats to a language.
"""
dates = []
fmts = lang_format.get(lang, lang_format[None])
for fmt in fmts:
re_, slots = format_re[fmt]
matches = re_.findall(text)
if matches:
for match in matches:
try:
date = datetime(
int(match[slots[0]]),
int(match[slots[1]]),
int(match[slots[2]]),
int(match[slots[3]] or 0),
int(match[slots[4]] or 0),
)
if date <= datetime.utcnow():
dates.append(date)
except:
pass
return dates
## from htmldate import find_date
# def extract_last_pub(html):
# """
# Return an estimate for the time of last content publication from html.
# """
# # https://stackoverflow.com/questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
# lxml_tree = lxml_html.fromstring(bytes(html, encoding='utf8'))
# # publication date (from startpage)
# try:
# date_string = find_date(lxml_tree)
# pd = date.fromisoformat(date_string)
# last_pub = datetime(pd.year, pd.month, pd.day, 12, 0, 0)
# except:
# last_pub = None
# return last_pub
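
A usage sketch (not part of the commit), using the import path from the test suite; the sample string is illustrative.

from atextcrawler.utils.date_finder import extract_dates, extract_latest_date

s = 'Updated on 2021-03-05, first published 3.1.2020 at 14:30.'
print(extract_dates(s, lang='de'))
# [datetime(2021, 3, 5, 0, 0), datetime(2020, 1, 3, 14, 30)]
print(extract_latest_date(s, lang='de'))
# datetime(2021, 3, 5, 0, 0)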

View file

@@ -0,0 +1,278 @@
"""
Hyperlink parsing.
"""
import logging
from typing import Optional
from urllib.parse import urlsplit
import tldextract
from async_dns import types
from async_dns.resolver import ProxyResolver
from async_lru import alru_cache
from .link import in_blacklist
logger = logging.getLogger(__name__)
resolver = ProxyResolver(request_timeout=2)
async_dns_logger = logging.getLogger('async_dns')
async_dns_logger.setLevel(logging.WARNING)
extract = tldextract.TLDExtract(cache_dir=False)
# tldextract uses filelock; set its loglevel to warning
filelock_logger = logging.getLogger('filelock')
filelock_logger.setLevel(logging.WARNING)
class Durl:
"""
Decomposed URL, contains :class:`urllib.parse.SplitResult`.
When constructing this class, it has to be awaited, e.g.:
my_durl = await Durl('http://www.example.com/whatever')
The given URL will be decomposed, validated and normalized.
If the URL is invalid, we return None instead of an instance.
If the given *base* is None, the URL must be absolute and
the hostname must be valid (DNS lookup).
If the given URL is not absolute, an already decomposed (and thus
valid) *base* Durl must be given; otherwise the URL is invalid.
The *base* Durl can contain a path (but no arguments or fragments),
in which case the URL - if not absolute - must begin with this path.
The scheme must be http or https. If the URL begins with '//',
'http:' is prepended.
If the hostname is longer than 90 characters, the URL is invalid.
Default port numbers (80 for http, 443 for https) are removed.
The hostname is changed to lower case. Spaces in the hostname
make the URL invalid.
URL fragments are removed.
"""
_url = None
_base = None
_match_base = False
def __init__(
self,
url: str,
base: Optional['Durl'] = None,
match_base: bool = False,
):
self._url = url
self._base = base
self._match_base = match_base
def __await__(self):
return self.__ainit__().__await__()
async def __ainit__(self):
res = None
try:
# add missing scheme for urls beginning with '//'
if self._url.startswith('//'):
self._url = 'http:' + self._url
# split the url
durl = urlsplit(self._url)
# remove default port numbers 80, 443
netloc = durl.netloc
if durl.port == 80 and durl.scheme == 'http':
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
if durl.port == 443 and durl.scheme == 'https':
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
if durl.hostname and durl.hostname != durl.netloc.lower():
user_pass = ''
if durl.username and durl.password:
user_pass = f'{durl.username}:{durl.password}@'
port = ''
if durl.port:
port = f':{durl.port}'
netloc = f'{user_pass}{durl.hostname.lower()}{port}'
durl = durl._replace(netloc=netloc)
if self._base:
# if missing fill in scheme and netloc from base
if not durl.scheme:
durl = durl._replace(scheme=self._base.scheme)
if not durl.netloc:
durl = durl._replace(netloc=self._base.netloc)
# if match_base, then set res only if the
# url is compatible with base url
if not self._match_base:
res = durl
else:
if durl.netloc == self._base.netloc:
if durl.scheme == self._base.scheme:
if self._base.path not in ('/', ''):
if durl.path.startswith(self._base.path):
res = durl
else:
res = durl
else:
res = durl
except:
logger.exception(
f'Durl init failed url={self._url}'
f' base={self._base} match_base={self._match_base}'
)
res = None
if res:
res = res._replace(fragment='')
if not res.hostname or len(res.hostname) > 90:
res = None
elif res.scheme not in ('https', 'http'):
res = None
elif ' ' in res.hostname or '.' not in res.hostname:
res = None
elif not (await get_ips(res.hostname)):
res = None
elif not res.path.startswith('/'):
res = res._replace(path='/')
if res:
self._durl = res
return self
self._durl = None
def __getattr__(self, attr):
return getattr(self._durl, attr)
def url(self) -> str:
"""
Return the URL as string.
"""
return self._durl.geturl()
def pwa(self) -> str:
"""
Return the (base-relative) path with args of the Durl.
"""
if self._base and self._match_base:
path = self._durl.path.removeprefix(self._base.path)
else:
path = self._durl.path
qs = f'?{self._durl.query}' if self._durl.query else ''
return f'{path}{qs}'.lstrip('/')
def has_path(self) -> bool:
"""
Return whether the Durl has a non-trivial path.
"""
return self._durl.path not in ('/', '')
def site(self) -> str:
"""
Return the site (base_url).
"""
return f'{self._durl.scheme}://{self._durl.netloc}/'
def domain(self) -> str:
"""
Return the domain of the Durl (wrong in case of second-level domains).
"""
levels = extract(self._durl.hostname)
return '.'.join(levels[-2:]).lower()
def replace_scheme(self, scheme: str) -> None:
"""
Replace the scheme (must be 'http' or 'https').
"""
self._durl = self._durl._replace(scheme=scheme)
@alru_cache(maxsize=1000)
async def get_ips(hostname: str) -> set[str]:
"""
Return IPv4 and IPv6 addresses of the given hostname.
"""
ips = set()
for type_ in (types.A, types.AAAA):
try:
res, cached = await resolver.query(hostname, type_)
if res:
if addr := res.get_record([type_]):
ips.add(addr.data)
except:
pass
return ips
def get_url_variants(url: str) -> list[str]:
"""
Return variants of the URL.
Replace http with https and vice versa;
prepend or remove 'www.' to or from the beginning of the hostname.
"""
if url.startswith('http://www.'):
s = url.removeprefix('http://www.')
return [url, f'http://{s}', f'https://www.{s}', f'https://{s}']
elif url.startswith('http://'):
s = url.removeprefix('http://')
return [url, f'http://www.{s}', f'https://www.{s}', f'https://{s}']
elif url.startswith('https://www.'):
s = url.removeprefix('https://www.')
return [url, f'https://{s}', f'http://www.{s}', f'http://{s}']
elif url.startswith('https://'):
s = url.removeprefix('https://')
return [url, f'https://www.{s}', f'http://www.{s}', f'http://{s}']
else:
return [url]
async def assort_links(
links: dict[str, tuple[int, int, list[str]]],
durl: Durl,
text: str,
base_url: str = None,
) -> tuple[
dict[str, tuple[int, int, list[str]]],
dict[Durl, tuple[list[str], str]],
dict[Durl, tuple[list[str], str]],
]:
"""
Sort links into a cleaned, an internal and an external dict.
The cleaned dict maps absolute URLs to char ranges and relations.
The internal and the external dict each map Durls to relations and the linked text.
The relations are link relations, e.g. rel="canonical".
If *base_url* is set, it is used to distinguish internal and external
links; if it is not set, the base_url is obtained from *durl*.
"""
res_int = {}
res_ext = {}
if not base_url:
base_url = durl.site().lower()
base_durl = await Durl(base_url)
cleaned_links = {}
for href, (i, f, rel) in links.items():
durl = await Durl(href, base=base_durl)
if not durl:
continue
if durl.hostname and in_blacklist(durl.hostname):
continue
cleaned_links[durl.url()] = i, f, rel
txt = text[i:f]
if durl.site().lower() == base_url:
res_int[durl] = rel, txt
else:
res_ext[durl] = rel, txt
return cleaned_links, res_int, res_ext
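
A usage sketch (not part of the commit). Durl instances must be awaited and the hostname check performs a DNS lookup, so this needs network access; the URLs are illustrative.

import asyncio

from atextcrawler.utils.durl import Durl, get_url_variants


async def main():
    durl = await Durl('https://www.example.com/path/page?x=1#frag')
    if durl:
        print(durl.url())   # 'https://www.example.com/path/page?x=1'
        print(durl.site())  # 'https://www.example.com/'
        print(durl.pwa())   # 'path/page?x=1'
    print(get_url_variants('https://example.com/'))
    # ['https://example.com/', 'https://www.example.com/',
    #  'http://www.example.com/', 'http://example.com/']


asyncio.run(main())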

View file

@@ -0,0 +1,136 @@
"""
Utilities for extracting information from html.
"""
import re
from html import unescape
from typing import Optional
from bs4 import BeautifulSoup
from .lang import clean_lang
from .tag import drop_roles, drop_tags, keep_tags
re_ = {
'html_lang': re.compile(
'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
),
'title': re.compile('<title[^>]*>([^<]*)</title>', re.I | re.S),
'strip': re.compile(
'<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S
),
'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
'whitespace': re.compile('(\s|&nbsp;)+', re.S),
'whitespace_': re.compile('\s|&nbsp;?'), # allow broken &nbsp
'whitespace_near_tag': re.compile(
'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
re.S,
),
'whitespace_tag_tag': re.compile('(\s+)((<[^>]+>\s+)+)', re.S),
'whitespace_tag_tag_func': re.compile('(<[^>]+>)\s+', re.S),
'http_equiv': re.compile('(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
}
def whitespace_tag_tag(match_obj):
"""
Helper function for removing whitespace between tags.
"""
return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', match_obj.group(2))
def clean_html(s: Optional[str]) -> Optional[str]:
"""
Clean an html string.
Unescape htmlentities and replace whitespaces with ' ' (ASCII char 0x20).
See also: https://www.lesinskis.com/python-unicode-whitespace.html
"""
return re_['whitespace_'].sub(' ', unescape(s)).strip() if s else None
def get_html_lang(html: str) -> Optional[str]:
"""
Return the language, if any, found in the lang attribute of the html tag.
"""
m = re_['html_lang'].search(html)
return clean_lang(m.group(1)) if m else None
def extract_title(html: str) -> Optional[str]:
"""
Extract title tags from html returning their content as a string.
"""
if not (titles := re_['title'].findall(html)):
return None
titles = [clean_html(title) for title in reversed(titles) if title]
return ' - '.join(titles).strip(' |')
def clean_page(html):
"""
Remove unwanted tags including their content from html.
Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
Also drop tags with attribute aria-hidden=true.
Return a beautiful soup.
"""
soup = BeautifulSoup(html, 'html.parser')
for tag in drop_tags:
for n in soup.find_all(tag):
n.decompose()
for n in soup.find_all(attrs={'aria-hidden': 'true'}):
n.decompose()
for role in drop_roles:
for n in soup.find_all(attrs={'rel': role}):
n.decompose()
return soup
def clean_body(body):
"""
Clean an html body.
Remove unwanted tags (keeping their content); remove empty tags;
remove and replace whitespaces in several ways.
In the end the only whitespace is a space and there are no
consecutive spaces.
"""
body = re_['strip'].sub(' ', body)
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
body = re_['whitespace'].sub(' ', body)
while re_['empty_tag'].search(body):
body = re_['empty_tag'].sub(r'\3', body)
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
body = re_['whitespace'].sub(' ', body)
body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
return body.strip().replace('\u00ad', '') # soft hyphen
def get_html_redirect(html: str) -> Optional[str]:
"""
Return an html redirect in an http-equiv meta tag.
If none is found, return None.
"""
redir_url = None
http_equivs = re_['http_equiv'].findall(html)
for raw in http_equivs:
tag = BeautifulSoup(raw, 'html.parser').meta
if tag and tag.get('http-equiv', '').lower() == 'refresh':
if content := tag.get('content'):
try:
_, redir_url = content.split(';')
redir_url = (
redir_url.strip()
.removeprefix('url=')
.removeprefix('URL=')
.strip("'")
)
except:
pass
return redir_url
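
A usage sketch (not part of the commit), using the import path from the test suite; the html fragments are illustrative.

from atextcrawler.utils.html import clean_body, clean_page, extract_title

html = '--<script>x=1</script><p aria-hidden="true">hidden</p><p>kept</p>--'
print(str(clean_page(html)))
# '--<p>kept</p>--'
print(clean_body('<div> <p>Hello   <em>world</em> </p> <span></span></div>'))
# '<div><p>Hello <em>world</em></p></div>'
print(extract_title('<head><title>My Site </title></head>'))
# 'My Site'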

View file

@@ -0,0 +1,58 @@
"""
Utility functions related to http.
"""
import re
from typing import Optional
from multidict import CIMultiDictProxy
from ..models import Site
from .durl import Durl
re_ = {
'link_header': re.compile(',\s*(?=<)'),
'rel_canonical': re.compile(';\s*rel\s*=\s*["\']?canonical', re.I),
'rel_shortlink': re.compile(';\s*rel\s*=\s*["\']?shortlink', re.I),
}
async def get_header_links(
headers: CIMultiDictProxy,
durl: Durl,
site: Optional[Site],
) -> dict[str, Optional[str]]:
"""
Extract canonical and shortlink links from http headers.
*durl* must be the Durl of the fetched page and *site* - if not None -
must be the Site to which the page belongs.
Return a (default)dict with 'canonical' and 'shortlink' as keys.
The values default to None.
"""
res = {}
canonical = shortlink = None
if 'link' in headers and (link_headers := headers.getall('link')):
links = []
for link_header in link_headers:
links += re_['link_header'].split(link_header)
url = durl.url()
base_url = site.base_url if site else url
base_durl = await Durl(base_url) if base_url else None
for link in links:
if not canonical and 'canonical' in link.lower():
if re_['rel_canonical'].search(link):
canon_url = link.strip().lstrip('<').split('>')[0]
if canon_durl := await Durl(canon_url, base=base_durl):
canonical = canon_durl.url()
if not shortlink and 'shortlink' in link.lower():
if re_['rel_shortlink'].search(link):
short_url = link.strip().lstrip('<').split('>')[0]
if short_durl := await Durl(short_url, base=base_durl):
shortlink = short_durl.url()
if canonical and shortlink:
break
res['canonical'] = canonical
res['shortlink'] = shortlink
return res
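
A usage sketch (not part of the commit): the import path, the header value and the URLs are assumptions, and resolving the hostnames requires network access. Passing None as site makes the page's own URL the base for resolving the header links.

import asyncio

from multidict import CIMultiDict, CIMultiDictProxy

from atextcrawler.utils.durl import Durl
from atextcrawler.utils.http import get_header_links  # assumed module path


async def main():
    headers = CIMultiDictProxy(
        CIMultiDict(
            link='<https://www.example.com/canonical>; rel="canonical", '
            '<https://www.example.com/?p=1>; rel="shortlink"'
        )
    )
    durl = await Durl('https://www.example.com/some/page')
    links = await get_header_links(headers, durl, None)
    print(links['canonical'])  # 'https://www.example.com/canonical'
    print(links['shortlink'])  # 'https://www.example.com/?p=1'


asyncio.run(main())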

View file

@@ -0,0 +1,32 @@
"""
Custom JSON encoder.
"""
import json
class JSONEncoderExt(json.JSONEncoder):
"""
Extended JSON encoder with encoding of sets as lists.
"""
def default(self, obj):
"""
Encode sets as lists and everything else as by default.
"""
if isinstance(obj, set):
return list(obj)
return json.JSONEncoder.default(self, obj)
def json_dumps(obj):
"""
Encode an object to a JSON string using JSONEncoderExt.
"""
return json.dumps(obj, cls=JSONEncoderExt)
json_loads = json.loads
"""
Decoding of JSON strings as by default.
"""

View file

@@ -0,0 +1,44 @@
"""
Utility functions related to languages.
"""
from pathlib import Path
from typing import Optional
import gcld3
asset_path = Path(__file__).parent.parent / 'assets'
with open(asset_path / 'iso_639-1', 'r') as f:
iso_639_1_codes = f.read().strip().split('\n')
lang_detector = gcld3.NNetLanguageIdentifier(
min_num_bytes=0, max_num_bytes=1000
)
def clean_lang(lang: Optional[str]) -> Optional[str]:
"""
Clean a language code string: it must be an ISO 639-1 code or None.
"""
if lang is None:
return None
lang = lang[:2].lower()
if lang in iso_639_1_codes:
return lang
return None
def extract_content_language(text: str) -> Optional[str]:
"""
Extract the language from a text.
"""
if len(text) < 10:
return None
lang = None
lang_det = lang_detector.FindLanguage(text=text)
if lang_det.is_reliable:
lang = lang_det.language[:2]
return lang
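
A usage sketch (not part of the commit); the module path is assumed and gcld3 must be installed.

from atextcrawler.utils.lang import clean_lang, extract_content_language

print(clean_lang('en-US'))  # 'en'
print(clean_lang('xx'))     # None ('xx' is not an ISO 639-1 code)
print(extract_content_language(
    'This is a short English paragraph used for language detection.'
))                          # 'en' (if the detector is confident)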

View file

@@ -0,0 +1,116 @@
"""
Hyperlinks (a href, link).
"""
from pathlib import Path
from typing import Optional
import tldextract
nofollow_link_rels = set(
[
'nofollow',
'search',
'noreferrer',
'noopener',
'help',
'license',
]
)
"""
Do not follow the hrefs in anchor tags with these values of the rel attribute.
"""
meta_names = (
'generator',
'lang',
'language',
'description',
'keywords',
'author',
'title',
'subject',
'revised',
'abstract',
'topic',
'summary',
'classification',
'category',
'reply-to',
'owner',
'url',
'identifier-URL',
'geo.position',
'geo.region',
'geo.placename',
'dc.language',
)
"""
Values of the name attribute of meta tags to keep.
See also: https://gist.github.com/lancejpollard/1978404
See also: https://github.com/joshbuchea/HEAD
"""
meta_props = (
'og:site_name',
'og:locale',
'og:type',
'og:latitude',
'og:longitude',
'og:street',
'og:locality',
'og:region',
'og:postal',
'og:country',
)
"""
Values of the property attribute of meta tags to keep.
"""
link_rels = set(
[
'webmention',
'pingback',
'alternate',
'canonical',
'author',
]
)
"""
Values of the rel attribute of link tags to keep.
"""
def load_blacklist():
"""
Return the 10000 most popular internet domains.
"""
path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
with open(path, 'r') as file:
domains = file.read().strip().splitlines()
return domains
domain_blacklist = load_blacklist()
def in_blacklist(hostname: str) -> Optional[str]:
"""
Return a match of host in the blacklist, or None.
"""
domain = extract_domain(hostname)
if domain in domain_blacklist:
return hostname
return None
def extract_domain(hostname: str) -> str:
"""
Extract the lower-case domain from a hostname.
"""
levels = tldextract.extract(hostname)
return '.'.join(levels[-2:]).lower()
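
A usage sketch (not part of the commit); the module path is assumed and the blacklist membership depends on the bundled top_1e4 asset.

from atextcrawler.utils.link import extract_domain, in_blacklist  # assumed path

print(extract_domain('blog.example.co.uk'))  # 'example.co.uk'
print(in_blacklist('www.google.com'))        # 'www.google.com' (if google.com is in the top-10000 list)
print(in_blacklist('my.obscure-blog.org'))   # None (not a popular domain)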

View file

@@ -0,0 +1,120 @@
"""
Parse muse-formatted plaintext (delivered by amusewiki).
"""
import re
from datetime import datetime
from typing import Optional
from .date_finder import extract_latest_date
from .lang import clean_lang
re_tag = re.compile(r'<[^<]+?>')
def parse_muse(text: str) -> Optional[tuple[dict, str]]:
"""
Parse a MUSE string returning meta information and the text body.
"""
head, body = split_head_body(text)
if not head:
return None
meta = parse_head(head)
if not meta:
return None
return extract_muse_meta(meta, body), body
def split_head_body(text: str) -> tuple[str, str]:
"""
Split a MUSE string into head and body and return both.
"""
head = ''
while text.startswith('#'):
line_end = text.find('\n') + 1
head += text[:line_end]
text = text[line_end:]
return head.strip(), text.strip()
def parse_head(text: str) -> dict:
"""
Parse a MUSE head and return a dict mapping field names to values.
"""
fields = {}
for line in text.split('\n'):
name, value = line.strip().split(' ', 1)
fields[name[1:]] = value
return fields
amusewiki_fields = [
'author',
'title',
'lang',
'LISTtitle', # reduced title for alphabetical sorting
'subtitle',
'SORTauthors', # authors separated by ';' or ',' (only for indexing)
'SORTtopics', # topics separated by ';' or ',' (only for indexing)
'date', # publication year
'pubdate', # publication datetime
'notes', # additional info (orig title, translators, credits, ...)
'source', # preferred format: "Retrieved on March 8, 2012 from {URL}"
'publisher',
'isbn',
#'rights',
'seriesname',
'seriesnumber',
#'hyphenation', # irrelevant
#'slides', # irrelevant
#'DELETED', # irrelevant
#'cover', # irrelevant
#'coverwidth', # irrelevant
#'nocoverpage', # irrelevant
#'notoc', # irrelevant
#'nofinalpage', # irrelevant
#'impressum', # irrelevant
#'continuefootnotes', # irrelevant
#'centerchapter', # irrelevant
#'centersection', # irrelevant
]
"""
Amusewiki header fields (cf. https://amusewiki.org/library/manual).
"""
re_list = re.compile('[;,]')
def extract_muse_meta(meta, body) -> dict:
"""
Extract meta information from muse header and muse body.
"""
authors = set()
if author := meta.get('author', '').strip():
authors.add(author)
if sortauthors := meta.get('SORTauthors', '').strip():
for author in re_list.split(sortauthors):
if author_ := author.strip():
authors.add(author_)
pubdate = meta.get('pubdate', '').strip()
pub_date: Optional[datetime] = None
if pubdate:
try:
pub_date = datetime.fromisoformat(pubdate)
except:
pub_date = extract_latest_date(pubdate)
summary = re_tag.sub('', body[:1000].split('\n\n')[0])
return {
'title': re_tag.sub('', meta.get('title', '')) or None,
'authors': authors,
'lang': clean_lang(meta.get('lang')),
'keywords': [
s.strip()
for s in re_list.split(meta.get('SORTtopics', '').strip())
if s.strip()
],
'pub_date': pub_date,
'summary': summary,
'orig_source': meta.get('source', '').strip() or None,
}
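
A usage sketch (not part of the commit); the module path and the muse snippet are assumptions.

from atextcrawler.utils.muse import parse_muse  # assumed module path

muse_text = """#title An Example Text
#author Jane Doe
#lang en
#SORTtopics example, testing
#pubdate 2021-11-01

This is the first paragraph and becomes the summary.

More text follows here.
"""

result = parse_muse(muse_text)
if result:
    meta, body = result
    print(meta['title'])     # 'An Example Text'
    print(meta['authors'])   # {'Jane Doe'}
    print(meta['keywords'])  # ['example', 'testing']
    print(meta['pub_date'])  # 2021-11-01 00:00:00
    print(meta['summary'])   # 'This is the first paragraph and becomes the summary.'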

View file

@@ -0,0 +1,22 @@
"""
Utility functions for probing / sampling.
"""
def extract_samples(items, n=5):
"""
Extract up to n sample elements from the given dict or list.
If *items* is a dict return the elements from the list of keys.
"""
l = len(items)
if l <= n:
return items
poss = []
step = (l + 1) / n
for i in range(n):
pos = int(step * i)
if pos < l and (not poss or pos > poss[-1]):
poss.append(pos)
items_list = list(items)
return [items_list[pos] for pos in poss]
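
A usage sketch (not part of the commit); the module path is assumed.

from atextcrawler.utils.probe import extract_samples  # assumed module path

print(extract_samples(list(range(20)), n=5))           # [0, 4, 8, 12, 16]
print(extract_samples({'a': 1, 'b': 2, 'c': 3}, n=5))  # short input: returned unchanged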

View file

@@ -0,0 +1,74 @@
"""
Operations on text sections.
Semantic breaks are character positions within a text (0-offset)
where a new section begins. More precisely, the character position
contains a space and only at the next position begins a tag that is
semantically breaking (e.g., a h1 or a br).
Each semantic break has a level, which means breaking strength.
The lower the level (e.g., h1 has a lower level than h2), the
stronger the break.
Implicitly, if position 0 has no semantic break, a semantic break
at position 0 with level 80 is added.
Semantic breaks can be used to split a text into sections.
The lower the maximum level of the semantic breaks taken into account,
the coarser the segmentation and the fewer the sections.
Each section is given the level of the semantic break at its beginning.
From another point of view, sections have levels indicating
the segmentation depth.
The levels for html tags are defined in tag.py.
The *semantic_breaks* argument in the functions below
is a dict mapping the character position of the semantic break
to the level of a section beginning at this position
(if segmentation is done at this or a higher level).
"""
def iter_sections(text, semantic_breaks, max_level=59):
"""
Iterate over sections, limiting to those with a maximum level.
Yield (start_pos, end_pos, level, text).
*text* is assumed to have the first semantic break at position 0.
"""
n = len(text)
last_pos = 0
last_level = semantic_breaks.get(0, 80)
for pos, level in sorted(semantic_breaks.items()):
if level <= max_level and last_pos != pos:
yield last_pos, pos, last_level, text[last_pos + 1 : pos]
last_pos = pos
last_level = level
if last_pos < n:
yield last_pos, n, last_level, text[last_pos:]
def concat_section_texts(text, semantic_breaks, min_len=2000):
"""
Try to concat consecutive sections into chunks with a minimum length.
Yield (section_ids, combined_text).
"""
n = len(text)
last_pos = 0
section_ids = []
for section_id, pos in enumerate(semantic_breaks.keys()):
if pos >= last_pos + min_len:
if n - pos < min_len:
for id_ in [
i for i, k in enumerate(semantic_breaks.keys()) if k >= pos
]:
section_ids.append(id_)
pos = n
yield section_ids, text[last_pos:pos]
last_pos = pos
section_ids = []
section_ids.append(section_id)
if last_pos < n:
yield section_ids, text[last_pos:]
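
A usage sketch (not part of the commit), using the import path from the test suite; the text and break levels are illustrative.

from atextcrawler.utils.section import concat_section_texts, iter_sections

text = ' Heading Some paragraph text More paragraph text'
semantic_breaks = {0: 30, 8: 60, 28: 60}

for start, end, level, section in iter_sections(text, semantic_breaks, max_level=100):
    print(start, end, level, repr(section))
# (0, 8, 30, 'Heading'), (8, 28, 60, 'Some paragraph text'),
# (28, 48, 60, ' More paragraph text')

for ids, chunk in concat_section_texts(text, semantic_breaks, min_len=20):
    print(ids, repr(chunk))
# ([0, 1], ' Heading Some paragraph text'), ([2], ' More paragraph text')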

View file

@@ -0,0 +1,92 @@
"""
Text similarity with simhash.
"""
import logging
from asyncpg import Connection
from simhash import Simhash, SimhashIndex
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
postgresql_bigint_offset = 9223372036854775808
"""
Subtract this number to get a PostgreSQL bigint from a 64bit int.
"""
def get_features(txt: str) -> list[str]:
"""
Extract features from string for use with Simhash.
"""
width = 3
txt = txt.replace(' ', '').lower()
return [txt[i : i + width] for i in range(max(len(txt) - width + 1, 1))]
def simhash_to_bigint(simhash: Simhash) -> int:
"""
Convert a simhash to PostgreSQL's bigint value range.
"""
return simhash.value - postgresql_bigint_offset
def simhash_from_bigint(bigint: int) -> Simhash:
"""
Convert a simhash from PostgreSQL's bigint to a Simhash instance.
"""
return Simhash(bigint + postgresql_bigint_offset, log=logger)
def get_simhash(text: str) -> Simhash:
"""
Return the Simhash of the given text.
"""
return Simhash(get_features(text), log=logger)
async def get_simhash_index(conn: Connection, site_id: int) -> SimhashIndex:
"""
Return a simhash index with hashes of all stored resources of the site.
"""
sql = (
"SELECT r.id, r.simhash FROM site_path sp, resource r"
" WHERE sp.site_id=$1 AND sp.resource_id=r.id"
)
rows = await conn.fetch(sql, site_id)
objs = [
(
str(row['id']),
Simhash(row['simhash'] + postgresql_bigint_offset, log=logger),
)
for row in rows
]
return SimhashIndex(objs, k=3, log=logger)
def create_simhash(
index: SimhashIndex,
resource_id: int,
simhash_instance: Simhash,
) -> int:
"""
Add a resource with given id and simhash to a simhash index.
Return the simhash value shifted into PostgreSQL's bigint range.
(The simhash field of the resource's database entry is not updated.)
"""
index.add(str(resource_id), simhash_instance)
return simhash_to_bigint(simhash_instance)
def search_simhash(index: SimhashIndex, simhash_inst: Simhash) -> list[int]:
"""
Return the ids of similar resources from the index.
"""
found = index.get_near_dups(simhash_inst)
if found:
return sorted([int(elem) for elem in found])
return []
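
A usage sketch (not part of the commit), using the import path from the test suite; the resource ids and texts are illustrative.

from simhash import SimhashIndex

from atextcrawler.utils.similarity import (
    create_simhash,
    get_simhash,
    search_simhash,
    simhash_from_bigint,
)

index = SimhashIndex([], k=3)
bigint_1 = create_simhash(index, 101, get_simhash('hello world ' * 10))
bigint_2 = create_simhash(index, 102, get_simhash('something completely different'))
# bigint_1 / bigint_2 fit into a PostgreSQL bigint column (resource.simhash)
print(search_simhash(index, simhash_from_bigint(bigint_1)))           # [101]
print(search_simhash(index, get_simhash('hello world ' * 10 + 'x')))  # most likely [101]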

View file

@@ -0,0 +1,189 @@
"""
Information collections related to html tags.
"""
drop_tags = [
'applet',
'area',
'audio',
'base',
'basefont',
'bdi',
'bdo',
'button',
'canvas',
'code',
'command',
'data',
'datalist',
'dir',
'embed',
'fieldset',
'figure',
'form',
'frame',
'frameset',
'iframe',
'img',
'input',
'label',
'legend',
'map',
'menuitem',
'meter',
'noframes',
'noscript',
'object',
'optgroup',
'option',
'param',
'picture',
'progress',
'rp',
'rt',
'ruby',
'samp',
'script',
'select',
'source',
'style',
'svg',
'template',
'textarea',
'track',
'var',
'video',
]
"""
Tags to drop, including their content.
"""
keep_tags = {
'a': (0, 0, ''),
'abbr': (0, 0, 'st'),
'acronym': (0, 0, 'st'),
'address': (1, 0, 'm'),
'article': (1, 15, ''),
'aside': (1, 0, 'd'),
'b': (0, 0, 'st'),
'blockquote': (1, 65, 'q'),
'br': (1, 80, ''),
'caption': (1, 68, ''),
'center': (1, 50, ''),
'cite': (1, 0, 'd'),
'col': (1, 75, ''),
'colgroup': (1, 73, ''),
'dd': (1, 70, 'li'),
'del': (0, 0, 'se'),
'details': (1, 0, 'd'),
'dfn': (0, 0, 'st'),
'div': (1, 60, ''), # lvl often revised to min of contained tags
'dl': (1, 70, 'l'),
'dt': (1, 70, 'li'),
'em': (0, 0, 'st'),
'figcaption': (1, 0, ''),
'font': (0, 0, 's'),
'footer': (1, 15, ''),
'h1': (1, 30, ''),
'h2': (1, 32, ''),
'h3': (1, 34, ''),
'h4': (1, 36, ''),
'h5': (1, 38, ''),
'h6': (1, 40, ''),
'header': (1, 15, ''),
'hr': (1, 30, ''),
'i': (0, 0, 'st'),
'ins': (0, 0, 'se'),
'li': (1, 75, 'li'), # lvl revised if not inside p
'main': (1, 10, ''),
'mark': (0, 0, 's'),
'nav': (1, 0, ''), # keep for footnotes
'ol': (1, 70, 'l'), # lvl revised if not inside p
'p': (1, 60, ''),
'pre': (1, 65, 'q'),
'q': (1, 0, 'q'),
's': (0, 0, ''),
'section': (1, 24, ''),
'small': (0, 0, 'd'),
'span': (0, 0, 's'),
'strike': (0, 0, 'se'),
'strong': (0, 0, 'st'),
'sub': (0, 0, ''),
'summary': (1, 20, 'm'),
'sup': (0, 0, ''),
'table': (1, 65, ''),
'tbody': (1, 70, ''),
'td': (1, 78, ''),
'tfoot': (1, 70, ''),
'th': (1, 75, ''),
'thead': (1, 70, ''),
'time': (0, 0, 'm'),
'tr': (1, 75, ''),
'u': (0, 0, 's'),
'ul': (1, 70, 'l'), # lvl revised if not inside p
}
"""
Tags to keep for annotation, and their properties.
The properties are:
* sep: whether to separate text at both sides of the tag with a space
* lvl: structural depth level of content of this tag;
the paragraph level is 60; headings are below 60, listings above;
a div below the tag will usually have the tag's depth + 1
* sem: semantic categories: zero or more of
* s=span
* l=listing
* i=list_item
* t=term
* e=edit
* d=details
* q=quote
* m=meta
* x=exclude
"""
self_closing_tags = ('br', 'hr')
"""
Those among keep_tags which are self-closing.
"""
all_self_closing_tags = (
'area',
'base',
'br',
'col',
'embed',
'hr',
'img',
'input',
'link',
'meta',
'param',
'source',
'track',
'wbr',
)
"""
All self-closing tags of the html standard.
"""
drop_roles = (
'banner',
'complementary',
'contentinfo',
'dialog',
'figure',
'form',
'img',
'search',
'switch',
)
"""
Drop tags with these aria roles.
"""

7
tests/__init__.py Normal file
View file

@@ -0,0 +1,7 @@
from .annotation import AnnotateTest
from .date_finder import DateFinderTest
from .page import PageCleanTest
from .section import IterSectionTest, AggSectionTest
from .simhash import SimhashTest
from .text import CleanHtmlTest
from .durl import DurlTest

49
tests/annotation.py Normal file
View file

@@ -0,0 +1,49 @@
"""
Test cases for html annotation.
"""
from unittest import TestCase
from atextcrawler.utils.annotation import annotate
class AnnotateTest(TestCase):
"""
Test annotation.
Consider that the <br> and <hr> tags are self-closing.
"""
def test_annotate_1(self):
s = '<em>Hello</em><br><strong>world</strong>'
text, anns = annotate(s)
self.assertEqual(text, ' Hello world')
self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
self.assertEqual(anns['section_ids'], {})
def test_annotate_2(self):
s = '<em> Hello </em><br><strong> world </strong>'
text, anns = annotate(s)
self.assertEqual(text, ' Hello world')
self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
self.assertEqual(anns['section_ids'], {})
def test_annotate_3(self):
s = '<p> Hello <em>world</em> </p> '
text, anns = annotate(s)
self.assertEqual(text, ' Hello world')
self.assertEqual(anns['semantic_breaks'], {0: 60})
def test_annotate_4(self):
s = '<div id = "ref1"><p>Hello <em>world</em> </p> </div>'
text, anns = annotate(s)
self.assertEqual(text, ' Hello world')
self.assertEqual(anns['semantic_breaks'], {0: 60})
self.assertEqual(anns['section_ids'], {0: ['ref1']})
def test_annotate_5(self):
s = '<br id="ref2"> Hello <p>world </p> '
text, anns = annotate(s)
self.assertEqual(text, ' Hello world')
self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 60})
self.assertEqual(anns['section_ids'], {1: ['ref2']})

20
tests/date_finder.py Normal file
View file

@@ -0,0 +1,20 @@
from datetime import datetime
from unittest import TestCase
from atextcrawler.utils.date_finder import extract_latest_date
class DateFinderTest(TestCase):
def test_extract_latest_date(self):
s = 'test 1987-2+1-no'
r = datetime(1987, 2, 1)
self.assertEqual(extract_latest_date(s), r)
s = '2020-04-06, whatever and 1987-2-1, 1/20/2021'
r = datetime(2020, 4, 6)
self.assertEqual(extract_latest_date(s, lang='de'), r)
s = 'test 2022-04-06, whatever and 1987-2-1, 1/20/2021'
r = datetime(2021, 1, 20)
self.assertEqual(extract_latest_date(s, lang='en'), r)
s = ''
r = None
self.assertEqual(extract_latest_date(s), r)

68
tests/durl.py Normal file
View file

@@ -0,0 +1,68 @@
from unittest import IsolatedAsyncioTestCase
import asyncpg
from atextcrawler.utils.durl import Durl
from atextcrawler.config import Config
from atextcrawler.db import PGPool
class DurlTest(IsolatedAsyncioTestCase):
async def asyncSetUp(self):
config = Config().get()
self.pool = PGPool(config['postgresql'])
await self.pool.__aenter__()
self.conn = await self.pool.pool.acquire()
async def test_durl_basic(self):
durl1 = await Durl('https://U:Pw@www.EXAMPLE.com:8000/hello?world#a')
self.assertEqual(durl1.scheme, 'https')
self.assertEqual(durl1.netloc, 'U:Pw@www.example.com:8000')
self.assertEqual(durl1.port, 8000)
self.assertEqual(durl1.path, '/hello')
self.assertEqual(durl1.fragment, '')
self.assertEqual(durl1.pwa(), 'hello?world')
self.assertEqual(durl1.site(), 'https://U:Pw@www.example.com:8000/')
self.assertEqual(
durl1.url(), 'https://U:Pw@www.example.com:8000/' 'hello?world'
)
self.assertEqual(durl1.has_path(), True)
durl2 = await Durl('http://www.example.com/')
self.assertEqual(durl2.has_path(), False)
durl3 = await Durl('ftp://www.example.com/')
self.assertEqual(durl3, None)
async def test_durl_with_base(self):
durl1 = await Durl('https://www.example.com')
self.assertEqual(durl1.path, '/')
self.assertEqual(durl1.pwa(), '')
self.assertEqual(durl1.has_path(), False)
durl2 = await Durl('https://www.example.com/hello2', base=durl1)
self.assertEqual(durl2.hostname, 'www.example.com')
self.assertEqual(durl2.path, '/hello2')
self.assertEqual(durl2.pwa(), 'hello2')
durl3 = await Durl('/hello3?x=1', base=durl1)
self.assertEqual(durl3.hostname, 'www.example.com')
self.assertEqual(durl3.path, '/hello3')
self.assertEqual(durl3.pwa(), 'hello3?x=1')
self.assertEqual(durl3.site(), 'https://www.example.com/')
durl4 = await Durl('https://www.kernel.org/', base=durl1)
self.assertEqual(durl4, None)
async def test_durl_with_base_and_match_base(self):
durl1 = await Durl('https://www.example.com/base/path/')
self.assertEqual(durl1.path, '/base/path/')
self.assertEqual(durl1.pwa(), 'base/path/')
self.assertEqual(durl1.has_path(), True)
durl2 = await Durl(
'https://www.example.com/base/', base=durl1, match_base=True
)
self.assertEqual(durl2, None)
durl3 = await Durl(
'https://www.example.com/base/path/whatever?x=1#a',
base=durl1,
match_base=True,
)
self.assertEqual(durl3.pwa(), 'whatever?x=1')
async def asyncTearDown(self):
await self.pool.pool.release(self.conn)
await self.pool.pool.close()

24
tests/page.py Normal file
View file

@@ -0,0 +1,24 @@
"""
Test cases for resource type page.
"""
from unittest import TestCase
from atextcrawler.utils.html import clean_body
# from atextcrawler.utils.tag import drop_tags
class PageCleanTest(TestCase):
def test_clean_body_1(self):
s = ' <em>Hello</em> <strong>world</strong> '
r = '<em>Hello</em> <strong>world</strong>'
self.assertEqual(clean_body(s), r)
# def test_drop_tags(self):
# s = '<figure what="ever">something<figure>else</figure>...</figure>'
# r = drop_tags(s)
# self.assertEqual(r, '')
# s = '<rt><rt><rt><rt>something</rt></rt></rt></rt>'
# r = drop_tags(s)
# self.assertEqual(r, '')

105
tests/section.py Normal file
View file

@@ -0,0 +1,105 @@
from unittest import TestCase
from atextcrawler.utils.section import concat_section_texts, iter_sections
class IterSectionTest(TestCase):
def test_iter_sections_1(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {0: 80, 5: 2, 15: 1, 20: 3}
sections1 = list(iter_sections(s, sb, max_level=100))
sections2 = [
(0, 5, 80, 'bcde'),
(5, 15, 2, 'ghijklmno'),
(15, 20, 1, 'qrst'),
(20, 26, 3, 'uvwxyz'),
]
self.assertEqual(sections1, sections2)
def test_iter_sections_2(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {0: 4, 5: 2, 15: 1, 20: 3, 26: 9}
sections1 = list(iter_sections(s, sb, max_level=100))
sections2 = [
(0, 5, 4, 'bcde'),
(5, 15, 2, 'ghijklmno'),
(15, 20, 1, 'qrst'),
(20, 26, 3, 'vwxyz'),
]
self.assertEqual(sections1, sections2)
def test_iter_sections_3(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {5: 2, 15: 60, 18: 50, 20: 3}
sections1 = list(iter_sections(s, sb, max_level=59))
sections2 = [
(0, 5, 80, 'bcde'),
(5, 18, 2, 'ghijklmnopqr'),
(18, 20, 50, 't'),
(20, 26, 3, 'uvwxyz'),
]
self.assertEqual(sections1, sections2)
def test_iter_sections_4(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {5: 2, 15: 60, 18: 50, 20: 3, 24: 60}
sections1 = list(iter_sections(s, sb, max_level=59))
sections2 = [
(0, 5, 80, 'bcde'),
(5, 18, 2, 'ghijklmnopqr'),
(18, 20, 50, 't'),
(20, 26, 3, 'uvwxyz'),
]
self.assertEqual(sections1, sections2)
class AggSectionTest(TestCase):
def test_concat_sections_1(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {0: 1, 5: 1, 15: 1, 20: 1}
sections1 = list(concat_section_texts(s, sb, min_len=10))
sections2 = [
([0, 1], 'abcdefghijklmno'),
([2, 3], 'pqrstuvwxyz'),
]
self.assertEqual(sections1, sections2)
def test_concat_sections_2(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {0: 1, 2: 1, 10: 1, 20: 1, 26: 1}
sections1 = list(concat_section_texts(s, sb, min_len=10))
sections2 = [
([0, 1], 'abcdefghij'),
([2, 3, 4], 'klmnopqrstuvwxyz'),
]
self.assertEqual(sections1, sections2)
def test_concat_sections_3(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {0: 1, 4: 1, 6: 1, 16: 1, 26: 1}
sections1 = list(concat_section_texts(s, sb, min_len=10))
sections2 = [
([0, 1, 2], 'abcdefghijklmnop'),
([3, 4], 'qrstuvwxyz'),
]
self.assertEqual(sections1, sections2)
def test_concat_sections_4(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {0: 1, 5: 1, 15: 1, 26: 1}
sections1 = list(concat_section_texts(s, sb, min_len=10))
sections2 = [
([0, 1], 'abcdefghijklmno'),
([2, 3], 'pqrstuvwxyz'),
]
self.assertEqual(sections1, sections2)
def test_concat_sections_5(self):
s = 'abcdefghijklmnopqrstuvwxyz'
sb = {0: 1, 5: 1, 12: 1, 22: 1, 23: 1, 24: 1, 26: 1}
sections1 = list(concat_section_texts(s, sb, min_len=10))
sections2 = [
([0, 1], 'abcdefghijkl'),
([2, 3, 4, 5, 6], 'mnopqrstuvwxyz'),
]
self.assertEqual(sections1, sections2)

54
tests/simhash.py Normal file
View file

@@ -0,0 +1,54 @@
"""
Test cases for text util.
"""
from unittest import TestCase
from simhash import Simhash, SimhashIndex
from atextcrawler.utils.similarity import (
create_simhash,
get_features,
get_simhash,
postgresql_bigint_offset,
search_simhash,
)
class SimhashTest(TestCase):
"""
Test simhash creation and search.
"""
def test_search(self):
n1 = int('1111111100000000', 2)
n2 = int('1111111100000111', 2)
n3 = int('1000000000000000', 2)
n4 = int('1000000000000111', 2)
n5 = int('1000001111000000', 2)
objs = [
('1', Simhash(n1)),
('3', Simhash(n3)),
('4', Simhash(n4)),
]
index = SimhashIndex(objs, k=3)
found = search_simhash(index, Simhash(n5))
self.assertEqual(found, [])
found = search_simhash(index, Simhash(n1))
self.assertEqual(found, [1])
found = search_simhash(index, Simhash(n2))
self.assertEqual(found, [1])
found = search_simhash(index, Simhash(n4))
self.assertEqual(found, [3, 4])
def test_create(self):
index = SimhashIndex([], k=3)
hash_val_1 = create_simhash(index, 101, get_simhash('hello ' * 20))
hash_val_2 = create_simhash(index, 102, get_simhash('another one'))
simhash_1 = Simhash(hash_val_1 + postgresql_bigint_offset)
simhash_2 = Simhash(hash_val_2 + postgresql_bigint_offset)
found = search_simhash(index, simhash_1)
self.assertEqual(found, [101])
found = search_simhash(index, simhash_2)
self.assertEqual(found, [102])
simhash_3 = get_simhash('hello ' * 20 + 'X')
found = search_simhash(index, simhash_3)
self.assertEqual(found, [101])

65
tests/text.py Normal file
View file

@@ -0,0 +1,65 @@
"""
Test cases for text util.
"""
from unittest import TestCase
from atextcrawler.utils.html import clean_page
class CleanHtmlTest(TestCase):
"""
Test clean_page.
Have an eye on self-closing tags (br, hr, ...).
"""
def test_clean_page_1(self):
s = '<em>Hello</em><br><script>malicious="<script>"</script>anything'
r = '<em>Hello</em><br/>anything'
self.assertEqual(str(clean_page(s)), r)
def test_clean_page_2(self):
s = '<em>Hello</em><br /><script>malicious<script></script>anything'
r = '<em>Hello</em><br/>anything'
self.assertEqual(str(clean_page(s)), r)
def test_clean_page_3(self):
# nesting
s = '--<figure>xx<figure>yy</figure>zz</figure>..'
r = '--..'
self.assertEqual(str(clean_page(s)), r)
def test_clean_page_4(self):
# aria-hidden
s = '--<p aria-hidden=true>xx</p>..'
r = '--..'
self.assertEqual(str(clean_page(s)), r)
s = '--<p aria-hidden="true">xx</p>..'
r = '--..'
self.assertEqual(str(clean_page(s)), r)
s = '--<p aria-hidden=false>xx</p>..'
r = '--<p aria-hidden="false">xx</p>..'
self.assertEqual(str(clean_page(s)), r)
s = '--<p aria-hidden="false">xx</p>..'
r = '--<p aria-hidden="false">xx</p>..'
self.assertEqual(str(clean_page(s)), r)
s = '--<p aria-hidden=??>xx</p>..'
r = '--<p aria-hidden="??">xx</p>..'
self.assertEqual(str(clean_page(s)), r)
def test_clean_page_5(self):
# no removal
s = '--<p>xx<em>yy</em></p>..'
r = '--<p>xx<em>yy</em></p>..'
self.assertEqual(str(clean_page(s)), r)
def test_clean_page_6(self):
# self-closing tags to be removed
s = '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn'
r = '--<p>xx</p>\n...<h1>tt</h1>nn'
self.assertEqual(str(clean_page(s)), r)
def test_clean_page_7(self):
s = '--<p rel=search>tt<area /></p>nn'
r = '--nn'
self.assertEqual(str(clean_page(s)), r)