Put under version control
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions
51
.gitignore
vendored
Normal file
|
@ -0,0 +1,51 @@
|
|||
# Backup files
|
||||
*.~
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
bin/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
NOTES
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
.tox/
|
||||
.coverage
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
htmlcov
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
|
||||
# mypy cache
|
||||
.mypy_cache
|
||||
|
||||
# Sphinx documentation
|
||||
doc/build/
|
||||
doc/source/reference/
|
||||
|
||||
# tmp dir
|
||||
tmp/
|
30
.pre-commit-config.yaml
Normal file
|
@ -0,0 +1,30 @@
|
|||
# See https://pre-commit.com for more information
|
||||
# See https://pre-commit.com/hooks.html for more hooks
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.0.1
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-yaml
|
||||
- id: check-added-large-files
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 21.11b1
|
||||
hooks:
|
||||
- id: black
|
||||
- repo: https://github.com/timothycrosley/isort
|
||||
rev: 5.10.1
|
||||
hooks:
|
||||
- id: isort
|
||||
args: ["--profile", "black", "--filter-files", "-l", "79"]
|
||||
- repo: https://github.com/myint/autoflake
|
||||
rev: v1.4
|
||||
hooks:
|
||||
- id: autoflake
|
||||
args:
|
||||
[
|
||||
"--in-place",
|
||||
"--remove-all-unused-imports",
|
||||
"--ignore-init-module-imports",
|
||||
"--remove-unused-variables",
|
||||
]
|
46
Pipfile
Normal file
|
@ -0,0 +1,46 @@
|
|||
[[source]]
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
aiohttp = "*"
|
||||
async-lru = "*"
|
||||
asyncpg = "*"
|
||||
beautifulsoup4 = "*"
|
||||
elasticsearch = { version = ">=7.0.0,<8.0.0", extras = ['async'] }
|
||||
elasticsearch-dsl = { version = ">=7.0.0,<8.0.0" }
|
||||
feedparser = "*"
|
||||
gcld3 = "*"
|
||||
# TODO: recheck
|
||||
pypandoc = "*"
|
||||
pytidylib = "*"
|
||||
pytz = "*"
|
||||
pyyaml = "*"
|
||||
tika = "*"
|
||||
tldextract = "*"
|
||||
voluptuous = "*"
|
||||
simhash = "*"
|
||||
async-dns = "*"
|
||||
types-pyyaml = "*"
|
||||
sphinx-rtd-theme = "*"
|
||||
|
||||
[dev-packages]
|
||||
mypy = "*"
|
||||
pre-commit = "*"
|
||||
sphinx = "*"
|
||||
myst-parser = "*"
|
||||
isort = "*"
|
||||
blacken-docs = "*"
|
||||
pybetter = "*"
|
||||
interrogate = "*"
|
||||
autoflake = "*"
|
||||
types-pyyaml = "*"
|
||||
types-pytz = "*"
|
||||
black = "*"
|
||||
|
||||
[requires]
|
||||
python_version = "3.9"
|
||||
|
||||
[pipenv]
|
||||
allow_prereleases = true
|
1561
Pipfile.lock
generated
Normal file
File diff suppressed because it is too large
13
README.md
Normal file
|
@ -0,0 +1,13 @@
|
|||
atextcrawler is an asynchronous webcrawler indexing text for literal and semantic search.
|
||||
|
||||
Its client-side counterpart is [atextsearch](https://gitea.multiname.org/a-text/atextsearch)
|
||||
|
||||
atextcrawler crawls and indexes selected websites.
|
||||
It starts from a few seed sites and follows their external links.
|
||||
Criteria defined in plugin code determine which linked sites (and
|
||||
which of their resources) are (recursively) added to the pool.
|
||||
|
||||
atextcrawler is written in Python, runs a configurable number of
|
||||
async workers concurrently (in one process), uses tensorflow for
|
||||
embedding (paragraph-sized) text chunks in a (multi-)language model
|
||||
and stores metadata in PostgreSQL and texts in elasticsearch.
|
20
doc/Makefile
Normal file
|
@ -0,0 +1,20 @@
|
|||
# Minimal makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line, and also
|
||||
# from the environment for the first two.
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = build
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
71
doc/source/conf.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
# Configuration file for the Sphinx documentation builder.
|
||||
#
|
||||
# This file only contains a selection of the most common options. For a full
|
||||
# list see the documentation:
|
||||
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||
|
||||
# -- Path setup --------------------------------------------------------------
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#
|
||||
# import os
|
||||
# import sys
|
||||
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
|
||||
import os
|
||||
import sys
|
||||
|
||||
proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
|
||||
sys.path.insert(0, proj_dir + '/src')
|
||||
|
||||
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = 'atextcrawler'
|
||||
copyright = '2021, ibu radempa'
|
||||
author = 'ibu radempa'
|
||||
|
||||
# The full version, including alpha/beta/rc tags
|
||||
release = '0.1.0'
|
||||
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.autosummary',
|
||||
'myst_parser',
|
||||
'sphinx.ext.graphviz',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
# This pattern also affects html_static_path and html_extra_path.
|
||||
exclude_patterns = []
|
||||
|
||||
|
||||
# -- Options for HTML output -------------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
#
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['_static']
|
||||
|
||||
autosummary_generate = True
|
||||
|
||||
source_suffix = {
|
||||
'.rst': 'restructuredtext',
|
||||
'.md': 'markdown',
|
||||
}
|
23
doc/source/config_template/initial_data/seed_urls.list
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Initial URLs (first run only)
|
||||
#
|
||||
# To whitelist a URL prepend '+', to blacklist prepend '-'.
|
||||
# Comment lines must begin with '#'.
|
||||
|
||||
# de
|
||||
+http://agd.blogsport.de/
|
||||
+https://blackblogs.org/blogs/
|
||||
+https://fau.org/
|
||||
+http://anarchiv.de/
|
||||
+http://olaf.bbm.de/die-aktion
|
||||
-https://www.anarchistischefoderation.de/
|
||||
|
||||
# en
|
||||
+https://anarchistarchivist.com/
|
||||
+https://bookshelf.theanarchistlibrary.org/library/
|
||||
+https://archive.elephanteditions.net/library/
|
||||
+https://blackrosefed.org/
|
||||
+https://alpineanarchist.org/
|
||||
+https://nostate.net/
|
||||
+https://abolishing.blackblogs.org/
|
||||
+http://library.nothingness.org/
|
||||
-https://www.anarchistfederation.net/
|
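A minimal sketch (plain Python, not part of the repository; `parse_seeds` is a hypothetical helper) of how lines in this format can be split into a whitelist and a blacklist:
```
# Hedged sketch: '+' whitelists a URL, '-' blacklists it, '#' starts a comment.
def parse_seeds(text):
    whitelist, blacklist = set(), set()
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        if line.startswith('+'):
            whitelist.add(line[1:])
        elif line.startswith('-'):
            blacklist.add(line[1:])
    return whitelist, blacklist
```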
88
doc/source/config_template/main.yaml
Normal file
|
@ -0,0 +1,88 @@
|
|||
# Name of this instance
|
||||
# Default value: atextcrawler
|
||||
# Allowed values: arbitrary string
|
||||
instance_name: atextcrawler
|
||||
|
||||
# Which kind of instance is this?
|
||||
# Default value: prod
|
||||
# Allowed values are:
|
||||
# - 'dev': development instance
|
||||
# - 'staging': staging instance
|
||||
# - 'prod': production instance
|
||||
instance_type: prod
|
||||
|
||||
# Log level
|
||||
# Default value: info
|
||||
# Allowed values: critical, error, warning, info, debug
|
||||
log_level: info
|
||||
|
||||
# Plugins directory
|
||||
# If given as relative path, it will be relative to the
|
||||
# directory of this file (main.yaml).
|
||||
# Read documentation on plugins.
|
||||
# Default value: plugins
|
||||
# Hint: Create an empty __init__.py in the plugins_dir.
|
||||
plugins_dir: plugins
|
||||
|
||||
# Parameters for access to the PostgreSQL service
|
||||
# No default values; must be set.
|
||||
postgresql:
|
||||
host: localhost
|
||||
port: 5432
|
||||
database: atextcrawler
|
||||
user: atextcrawler
|
||||
password: ________________________
|
||||
|
||||
# Crawling
|
||||
crawl:
|
||||
# Number of concurrent workers
|
||||
# Default value: 10
|
||||
# Allowed values: integer >=0 and <=1000
|
||||
#workers: 3
|
||||
|
||||
# Delay in seconds between attempts to fetch items
|
||||
# from site_queue if the last attempt gave no item
|
||||
# Also the delay in seconds after a worker has found
|
||||
# no site to process
|
||||
# Default value: 600
|
||||
# Allowed values: positive number
|
||||
#site_delay: 10
|
||||
|
||||
# Time interval in seconds between site updates when
|
||||
# handling queued base URLs
|
||||
# Default value: 3600
|
||||
# Allowed values: positive number
|
||||
#site_revisit_interval: 3600
|
||||
|
||||
# Delay in seconds between attempts to process
|
||||
# individual resources (pages etc.) of a site
|
||||
# Default value: 5
|
||||
# Allowed values: positive number
|
||||
#resource_delay: 3
|
||||
|
||||
# Default interval in seconds between full crawls of a site
|
||||
# Default value: 864000 (10 days)
|
||||
# Allowed values: positive number
|
||||
#full_crawl_interval: 864000
|
||||
|
||||
# Default interval in seconds between feed crawls of a site
|
||||
# Default value: 86400 (1 day)
|
||||
# Allowed values: positive number
|
||||
#feed_crawl_interval: 86400
|
||||
|
||||
# Parameters for access to the ElasticSearch service
|
||||
# No default values; must be set.
|
||||
elasticsearch:
|
||||
# host on which ES is running
|
||||
host: localhost
|
||||
# API key for accessing ES
|
||||
api_key: "**********************"
|
||||
# API user id
|
||||
id: "**********************"
|
||||
# Index base name (actual index names will have '_text' etc. appended)
|
||||
index_base_name: atext
|
||||
|
||||
# Tensorflow access
|
||||
tensorflow:
|
||||
# The prediction endpoint of the model server's sentence model
|
||||
model_server_endpoint: http://localhost:9000/v1/models/sentences:predict
|
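For orientation, a minimal sketch of reading this file with PyYAML (which is in the Pipfile); the project's real loader is the `Config` class in `src/atextcrawler/config.py`, so this is illustration only:
```
# Hedged sketch, not the repo's Config class: load main.yaml and read settings.
import yaml

with open('main.yaml') as f:
    conf = yaml.safe_load(f)

print(conf['instance_name'])
print(conf['postgresql']['host'])
# commented-out keys such as crawl.workers fall back to their documented defaults
print(conf.get('crawl', {}).get('workers', 10))
```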
0
doc/source/config_template/plugins/__init__.py
Normal file
22
doc/source/config_template/plugins/filter_resource_path.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
Filter paths found in a resource.
|
||||
|
||||
This plugin implements :func:`rp_filter`.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def rp_filter(site, durl) -> Optional[str]:
|
||||
"""
|
||||
Adjust or filter found paths (may depend on site).
|
||||
|
||||
To filter out a path (i.e., not add it to table `site_path`)
|
||||
return None.
|
||||
"""
|
||||
path = durl.pwa()
|
||||
# skip fetching images (linked from a tags; img tags are skipped anyway)
|
||||
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
|
||||
return None
|
||||
path = path.removesuffix('?amp=1')
|
||||
return path
|
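A hedged example of a site-specific override that could be placed in the configured `plugins_dir`; `example.org` is a made-up site, everything else mirrors the default plugin above:
```
# Hypothetical override (not repo code): additionally skip PDF links
# on one specific site; otherwise behave like the default rp_filter above.
from typing import Optional


def rp_filter(site, durl) -> Optional[str]:
    path = durl.pwa()
    if path.lower().endswith(('.jpg', '.png')):
        return None
    if 'example.org' in site.base_url and path.lower().endswith('.pdf'):
        return None
    return path.removesuffix('?amp=1')
```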
47
doc/source/config_template/plugins/filter_site.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
"""
|
||||
Relevance estimation of sites.
|
||||
|
||||
This plugin implements :func:`site_filter`.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from atextcrawler.models import Site
|
||||
|
||||
MIN_RELEVANCE_SCORE = 5
|
||||
|
||||
|
||||
async def site_filter(site: Site) -> bool:
|
||||
"""
|
||||
Assess relevance of the site (using language-dependent criteria).
|
||||
|
||||
If the site shall be crawled, return True, else False.
|
||||
"""
|
||||
# limit to sites in English or German language
|
||||
if not set(['de', 'en']) & set(site.langs):
|
||||
return False
|
||||
score = 0.0
|
||||
for crit_name, weight, langs, crit_re in re_criteria:
|
||||
if '*' in langs or set(langs) & set(site.langs):
|
||||
findings = crit_re.findall(site.startpage_text)
|
||||
if findings:
|
||||
score += weight * len(findings)
|
||||
if site.title and crit_re.search(site.title):
|
||||
score += 4 * weight
|
||||
if site.description and crit_re.search(site.description):
|
||||
score += 4 * weight
|
||||
|
||||
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
|
||||
|
||||
return score >= MIN_RELEVANCE_SCORE
|
||||
|
||||
|
||||
re_criteria = {
|
||||
(
|
||||
'anarch',
|
||||
1.0,
|
||||
('*',),
|
||||
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
|
||||
),
|
||||
('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
|
||||
}
|
24
doc/source/config_template/plugins/filter_site_path.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
"""
|
||||
Plugin for filtering paths of a site to be retrieved.
|
||||
|
||||
This plugin implements :func:`sp_filter`.
|
||||
"""
|
||||
|
||||
|
||||
def sp_filter(site, path, robots) -> bool:
|
||||
"""
|
||||
Per-site path filter. Return whether the path shall be retrieved.
|
||||
"""
|
||||
if not robots.can_fetch_url(site.base_url + path):
|
||||
return False
|
||||
if 'amusewiki' in site.meta_info.get('generator', '').lower():
|
||||
if any(
|
||||
[
|
||||
path.endswith(end)
|
||||
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
|
||||
]
|
||||
):
|
||||
return False
|
||||
if '/bbselect?' in path:
|
||||
return False
|
||||
return True
|
63
doc/source/devel/devel.md
Normal file
|
@ -0,0 +1,63 @@
|
|||
## Setup dev environment
|
||||
1. You need python 3.9 or later.
|
||||
1. Have pipenv installed, e.g.: install pip3 with `apt install python3-pip`, then run `pip3 install --user pipenv`
|
||||
1. Clone the repo and setup a virtualenv:
|
||||
```
|
||||
cd YOUR_DEV_DIR
|
||||
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
|
||||
cd atextcrawler
|
||||
pipenv install -d
|
||||
```
|
||||
|
||||
## Configure the instance
|
||||
See [installation](installation.md).
|
||||
|
||||
## Run
|
||||
```
|
||||
python -m atextcrawler
|
||||
```
|
||||
|
||||
## Logging
|
||||
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
|
||||
```
|
||||
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
|
||||
```
|
||||
|
||||
## Upgrading
|
||||
Upgrade dev tools:
|
||||
```
|
||||
pre-commit autoupdate
|
||||
```
|
||||
|
||||
## Test and clean manually
|
||||
```
|
||||
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
|
||||
mypy --ignore-missing-imports src/atextcrawler
|
||||
isort src/atextcrawler
|
||||
black -S -t py37 -l 79 src/atextcrawler
|
||||
pybetter --exclude B004,B007,B008 src/atextcrawler
|
||||
interrogate -i -I -m -v src/atextcrawler
|
||||
```
|
||||
|
||||
## Release
|
||||
There are no releases (currently).
|
||||
|
||||
## Useful commands
|
||||
|
||||
### Fetch a resource or a site manually
|
||||
```
|
||||
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
|
||||
python -m atextcrawler.site https://www.katesharpleylibrary.net/
|
||||
```
|
||||
|
||||
### SQL
|
||||
```
|
||||
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;
|
||||
|
||||
http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*
|
||||
|
||||
http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices
|
||||
|
||||
-- stats: sites, paths, resources
|
||||
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
|
||||
```
|
64
doc/source/devel/related_work.md
Normal file
|
@ -0,0 +1,64 @@
|
|||
## Related work
|
||||
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
|
||||
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)
|
||||
|
||||
### crawlers
|
||||
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
|
||||
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
|
||||
* [repo](https://github.com/adbar/trafilatura)
|
||||
* [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
|
||||
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
|
||||
* [scrapy](https://docs.scrapy.org/en/latest/)
|
||||
* [heritrix3](https://github.com/internetarchive/heritrix3/)
|
||||
* [YaCy](https://yacy.net/)
|
||||
* [searchmysite](https://searchmysite.net/)
|
||||
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
|
||||
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider)
|
||||
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
|
||||
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)
|
||||
|
||||
#### general
|
||||
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)
|
||||
|
||||
### sitemap parsers
|
||||
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)
|
||||
|
||||
### url handling
|
||||
* [courlan](https://pypi.org/project/courlan/)
|
||||
|
||||
### language detection
|
||||
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
|
||||
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
|
||||
* [guess_language](https://pypi.org/project/guess-language/)
|
||||
* [cld3](https://github.com/google/cld3)
|
||||
|
||||
### text extraction
|
||||
* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/)
|
||||
|
||||
### deduplication
|
||||
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
|
||||
* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
|
||||
* remove paragraphs with more than 50% word-7-tuples encountered previously
|
||||
|
||||
### Extract more meta tags
|
||||
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
|
||||
https://support.shareaholic.com/hc/en-us/articles/115003085186
|
||||
|
||||
### Date parsing dependent on language
|
||||
* https://en.wikipedia.org/wiki/Date_format_by_country
|
||||
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
|
||||
* https://pypi.org/project/dateparser/
|
||||
* https://github.com/ovalhub/pyicu
|
||||
* https://github.com/night-crawler/cldr-language-helpers
|
||||
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language
|
||||
|
||||
ICU
|
||||
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
|
||||
* https://gist.github.com/dpk/8325992
|
||||
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
|
||||
* https://unicode-org.github.io/icu/userguide/
|
||||
* https://unicode-org.github.io/icu-docs/#/icu4c/
|
||||
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
|
||||
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
|
||||
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
|
||||
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview
|
77
doc/source/devel/todo.md
Normal file
|
@ -0,0 +1,77 @@
|
|||
## TODO
|
||||
|
||||
* parse html time tags
|
||||
|
||||
* site annotations:
|
||||
* categories
|
||||
* historical (no changes any more since n months)
|
||||
* news
|
||||
* local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip
|
||||
|
||||
* allow for tls in elasticsearch config
|
||||
|
||||
* replace dashes, dots and quotes: https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
|
||||
```
|
||||
'–': '--',
'—': '---',
'…': '...',
'“': '"',
'”': '"',
'„': '"',
'″': '"',
'‘': "'",
'’': "'",
'′': "'",
|
||||
```
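A sketch (assumption: plain `str.translate`, not the calibre code linked above) of applying such a mapping:
```
# Hedged sketch: normalize "smart" punctuation with a translation table.
REPLACEMENTS = {
    '\u2013': '--',   # en dash
    '\u2014': '---',  # em dash
    '\u2026': '...',  # ellipsis
    '\u201c': '"', '\u201d': '"', '\u201e': '"', '\u2033': '"',
    '\u2018': "'", '\u2019': "'", '\u2032': "'",
}
TABLE = str.maketrans(REPLACEMENTS)


def unsmarten(text: str) -> str:
    return text.translate(TABLE)
```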
|
||||
* normalize quotation marks and punctuation in general
|
||||
* https://unicode-table.com/en/sets/quotation-marks/
|
||||
* https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
|
||||
* https://www.fileformat.info/info/unicode/category/Po/list.htm
|
||||
* https://www.gaijin.at/en/infos/unicode-character-table-punctuation
|
||||
* ⁝
|
||||
|
||||
* cancel crawls that take too long
|
||||
|
||||
* search for "TODO" in code
|
||||
|
||||
* feedparser has support for JSON feeds since commit
|
||||
a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
|
||||
(as of 2020-10-26 in "develop" branch, not part of a release)
|
||||
the version names are 'json1' and 'json11'
|
||||
|
||||
* allow site URLs with path, e.g.
|
||||
https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/
|
||||
|
||||
* add more languages
|
||||
|
||||
## Ideas
|
||||
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives
|
||||
|
||||
* [space-langdetect](https://pypi.org/project/spacy-langdetect/)
|
||||
* [langid.py](https://github.com/saffsd/langid.py)
|
||||
|
||||
* [gain](https://github.com/gaojiuli/gain)
|
||||
* [ruia](https://docs.python-ruia.org/)
|
||||
* [demiurge](https://demiurge.readthedocs.io/)
|
||||
* [cocrawler](https://github.com/cocrawler/cocrawler/)
|
||||
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)
|
9
doc/source/development.rst
Normal file
|
@ -0,0 +1,9 @@
|
|||
Development
|
||||
-----------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
devel/devel
|
||||
devel/todo
|
||||
devel/related_work
|
119
doc/source/elasticsearch.md
Normal file
|
@ -0,0 +1,119 @@
|
|||
# Howto elasticsearch
|
||||
|
||||
## Prerequisites
|
||||
On the host (virtualization host) we need:
|
||||
```
|
||||
# cat /etc/sysctl.d/virtual_memory.conf
|
||||
vm.max_map_count=262144
|
||||
# sysctl -p /etc/sysctl.d/virtual_memory.conf
|
||||
```
|
||||
|
||||
If this cannot be done, change this file after installing or upgrading elasticsearch:
|
||||
```
|
||||
/usr/lib/sysctl.d/elasticsearch.conf
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
### Install package
|
||||
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).
|
||||
|
||||
We do a manual install. If you configure the apt repo instead, also think about setting
|
||||
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.
|
||||
|
||||
```
|
||||
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
|
||||
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
|
||||
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
|
||||
dpkg -i elasticsearch-7.15.2-amd64.deb
|
||||
systemctl daemon-reload
|
||||
systemctl enable elasticsearch.service
|
||||
systemctl start elasticsearch.service
|
||||
```
|
||||
|
||||
First test:
|
||||
```
|
||||
http -j GET 127.0.0.1:9200/
|
||||
```
|
||||
|
||||
### Storage
|
||||
|
||||
```
|
||||
systemctl stop elasticsearch.service
|
||||
mv /var/lib/elasticsearch/ /srv/
|
||||
systemctl start elasticsearch.service
|
||||
```
|
||||
|
||||
Edit /etc/elasticsearch/elasticsearch.yml
|
||||
```
|
||||
cluster.name: org.a-text.search
|
||||
node.name: atext1
|
||||
path.data: /srv/elasticsearch
|
||||
path.logs: /var/log/elasticsearch
|
||||
discovery.seed_hosts: ["atext1.multiname.org"]
|
||||
xpack.security.enabled: true
|
||||
xpack.security.authc.api_key.enabled: true
|
||||
```
|
||||
|
||||
```
|
||||
systemctl restart elasticsearch
|
||||
```
|
||||
|
||||
The logfile now is at
|
||||
```
|
||||
/var/log/elasticsearch/org.a-text.search.log
|
||||
```
|
||||
|
||||
### Setup passwords
|
||||
Setup passwords:
|
||||
```
|
||||
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
|
||||
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
|
||||
The passwords will be randomly generated and printed to the console.
|
||||
Please confirm that you would like to continue [y/N]y
|
||||
```
|
||||
|
||||
Copy output to /etc/elasticsearch/passwords and
|
||||
```
|
||||
chmod 400 /etc/elasticsearch/passwords
|
||||
```
|
||||
|
||||
Check login as user elastic:
|
||||
```
|
||||
http --auth elastic:************** -j GET http://127.0.0.1:9200/
|
||||
```
|
||||
|
||||
### Memory limitation
|
||||
To limit memory usage
|
||||
```
|
||||
mkdir /etc/systemd/system/elasticsearch.service.d
|
||||
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
|
||||
[Service]
|
||||
LimitMEMLOCK=8G
EOF

systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
|
||||
```
|
||||
The daemon-reload and restart apply the new limit.
|
||||
|
||||
## Usage
|
||||
Some useful requests:
|
||||
|
||||
### List indices
|
||||
```
|
||||
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
|
||||
```
|
||||
### Health
|
||||
```
|
||||
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
|
||||
```
|
||||
### Node attributes
|
||||
```
|
||||
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
|
||||
```
|
||||
### Create API key
|
||||
```
|
||||
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
|
||||
```
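For reference, a minimal sketch of using such an API key from the async client pinned in the Pipfile (`elasticsearch[async]` 7.x); the id/key placeholders must be replaced with the values returned by the request above:
```
# Hedged sketch: authenticate with an API key and print the cluster version.
import asyncio

from elasticsearch import AsyncElasticsearch


async def main():
    es = AsyncElasticsearch(
        hosts=['http://localhost:9200'],
        api_key=('<api key id>', '<api key>'),  # placeholders
    )
    info = await es.info()
    print(info['version']['number'])
    await es.close()


asyncio.run(main())
```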
|
37
doc/source/index.rst
Normal file
|
@ -0,0 +1,37 @@
|
|||
atextcrawler
|
||||
============
|
||||
|
||||
atextcrawler is an asynchronous webcrawler indexing text
|
||||
for literal and semantic search.
|
||||
|
||||
Its client-side counterpart is atextsearch_.
|
||||
|
||||
atextcrawler crawls and indexes selected websites.
|
||||
It starts from a few seed sites and follows their external links.
|
||||
Criteria defined in plugin code determine which linked sites (and
|
||||
which of their resources) are (recursively) added to the pool.
|
||||
|
||||
atextcrawler is written in Python, runs a configurable number of
|
||||
async workers concurrently (in one process), uses tensorflow for
|
||||
embedding (paragraph-sized) text chunks in a (multi-)language model
|
||||
and stores metadata in PostgreSQL and texts in elasticsearch.
|
||||
|
||||
.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
|
||||
introduction
|
||||
installation
|
||||
maintenance
|
||||
development
|
||||
reference/modules
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
122
doc/source/installation.md
Normal file
|
@ -0,0 +1,122 @@
|
|||
# Installation
|
||||
Installation was only tested on Debian bullseye (on amd64).
|
||||
The instructions below are for this system.
|
||||
(Please adapt to other environments.)
|
||||
|
||||
## System packages
|
||||
```
|
||||
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
|
||||
```
|
||||
The protobuf packages are required for python package gcld3 (see below).
|
||||
|
||||
## PostgreSQL database
|
||||
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
|
||||
```
|
||||
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
|
||||
```
|
||||
|
||||
## Elasticsearch
|
||||
We need access to an elasticsearch instance (over TCP/IP).
|
||||
|
||||
Note: TLS is not yet supported, so install this service locally.
|
||||
|
||||
See [elasticsearch howto](elasticsearch.md).
|
||||
|
||||
## Tensorflow model server
|
||||
We need access to a tensorflow model server (over TCP/IP).
|
||||
It should serve `universal_sentence_encoder_multilingual`
|
||||
or a similar language model.
|
||||
|
||||
Note: TLS is not yet supported, so install this service locally.
|
||||
|
||||
See [tensorflow howto](tensorflow_model_server.md).
|
||||
|
||||
## Setup virtualenv and install atextcrawler
|
||||
```
|
||||
apt install python3-pip
|
||||
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
|
||||
su - atextcrawler
|
||||
cat >>.bashrc <<EOF
|
||||
export PYTHONPATH=\$HOME/repo/src
|
||||
EOF
|
||||
pip3 install --user pipenv
|
||||
cat >>.profile <<EOF
|
||||
PYTHONPATH=\$HOME/repo/src
|
||||
PATH=\$HOME/.local/bin:\$PATH
|
||||
\$HOME/.local/bin/pipenv shell
|
||||
EOF
|
||||
exit
|
||||
su - atextcrawler
|
||||
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
|
||||
cd repo
|
||||
pipenv sync
|
||||
pipenv install --site-packages # for systemd
|
||||
pre-commit install
|
||||
```
|
||||
|
||||
Note: One of the dependencies, Python package `tldextract`,
|
||||
uses this directory for caching:
|
||||
```
|
||||
$HOME/.cache/python-tldextract/
|
||||
```
|
||||
|
||||
## Configure atextcrawler
|
||||
As user `atextcrawler` execute
|
||||
```
|
||||
mkdir $HOME/.config
|
||||
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
|
||||
```
|
||||
|
||||
Edit `$HOME/.config/atextcrawler/main.yaml`.
|
||||
|
||||
If you want to override a plugin, copy it to the plugins directory
|
||||
and edit it, e.g.
|
||||
```
|
||||
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/atextcrawler/plugins
|
||||
```
|
||||
|
||||
Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.
|
||||
|
||||
Check (and print) the instance configuration:
|
||||
```
|
||||
python -m atextcrawler.config
|
||||
```
|
||||
|
||||
## Test run
|
||||
To see if it works, run `atextcrawler` from the command line:
|
||||
```
|
||||
python -m atextcrawler
|
||||
```
|
||||
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.
|
||||
|
||||
## Install systemd service
|
||||
To make the service persistent, create a systemd unit file
|
||||
`/etc/systemd/system/atextcrawler.service` with this content:
|
||||
```
|
||||
[Unit]
|
||||
Description=atextcrawler web crawler
|
||||
Documentation=https://gitea.multiname.org/a-text/atextcrawler
|
||||
Requires=network.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=atextcrawler
|
||||
Group=atextcrawler
|
||||
WorkingDirectory=/srv/atextcrawler/repo
|
||||
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
|
||||
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
|
||||
TimeoutStartSec=30
|
||||
ExecStop=/bin/kill -INT $MAINPID
|
||||
TimeoutStopSec=180
|
||||
Restart=on-failure
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
and
|
||||
```
|
||||
systemctl daemon-reload
|
||||
systemctl enable atextcrawler
|
||||
systemctl start atextcrawler
|
||||
```
|
66
doc/source/introduction.md
Normal file
|
@ -0,0 +1,66 @@
|
|||
# Introduction
|
||||
|
||||
## What atextcrawler does:
|
||||
* Start from a seed (white+black-)list of website base URLs
|
||||
* Loop over sites selected by applying criteria to the content
|
||||
of the site's start page
|
||||
* Crawl the site, i.e. loop over resources of the site
|
||||
* Extract plaintext content from the resource (html parsing is
|
||||
optimized for html5); discard non-text content, but handle feeds
|
||||
and sitemaps
|
||||
* Extract internal and external links; external links contribute
|
||||
to the site list
|
||||
* Keep track of the sites and resources in a PostgreSQL database
|
||||
* Store plaintext content of resources in an Elasticsearch index
|
||||
* Store vector embeddings of plaintexts also in Elasticsearch
|
||||
using tensorflow model server with a multilingual language model
|
||||
|
||||
## Architecture
|
||||
There is only one python process running concurrently.
|
||||
We use asyncio where possible (almost everywhere).
|
||||
|
||||
1. There is a queue of websites, see database table `site_queue`.
|
||||
The queue is fed a) on first startup with seeds, b) manually
|
||||
and c) from crawls which find external links.
|
||||
When the queue is handled, new sites are stored in table `site`.
New sites are always updated; existing sites only if their last update was more than `crawl.site_revisit_interval` seconds in the past.
|
||||
After the queue has been handled there is a delay
|
||||
(`crawl.site_delay` seconds) before repetition.
|
||||
1. Updating a site means: the start page is fetched and
|
||||
criteria are applied to its content to determine whether
|
||||
the site is relevant. (It is assumed that (non-)relevance is
|
||||
obvious from the start page already.) If the site is relevant,
|
||||
more information is fetched (e.g. sitemaps).
|
||||
1. There is a configurable number of crawler workers (config
|
||||
`crawl.workers`) which concurrently crawl sites, one at a time
|
||||
per worker. (During the crawl the site is marked as locked using
|
||||
crawl_active=true.) They pick a relevant site which has not been crawled for a certain time ("checkout"), crawl it, and finally mark it as crawled (crawl_active=false, "checkin") and schedule the next crawl.
|
||||
Each crawl (with begin time, end time, number of found (new)
|
||||
resources) is stored in table `crawl`.
|
||||
1. Crawls are either full crawls (in which all paths reachable
through links from the start page are fetched) or feed crawls (in which only paths listed in a feed of the site are fetched). The respective (minimum) intervals at which these crawls happen are `full_crawl_interval` and `feed_crawl_interval`.
|
||||
Feed crawls can happen more frequently (e.g. daily).
|
||||
1. When a path is fetched it can result in a MetaResource (feed or
|
||||
sitemap) or a TextResource (redirects are followed and irrelevant content is ignored). A TextResource obtained from a path can be very similar to a resource obtained from another path; in this case no new resource is created, but both paths are linked to the same resource (see tables `site_path` and `resource`).
|
||||
1. If a MetaResource is fetched and it is a sitemap, its paths are
|
||||
added to table `site_path`. If it is a feed, the feed is stored in table `site_feed` and its paths are added to table `site_path`.
|
||||
1. Links between sites are stored in table `site_link`.
|
||||
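The crawl worker behaviour described in the list above can be sketched as simplified pseudo-code (the actual implementation is `CrawlWorker` in `src/atextcrawler/crawl.py`, not shown here; `checkout_site`, `crawl_site` and `checkin_site` are illustrative names only):
```
# Illustrative sketch only, not the repo's CrawlWorker implementation.
async def worker_loop(app, pool):
    while app.running:
        site = await checkout_site(pool)  # pick a due site, set crawl_active=true
        if site is None:
            # no site due: wait crawl.site_delay seconds, then retry
            await app.sleep(app.config['crawl']['site_delay'])
            continue
        crawl = await crawl_site(app, site)  # full crawl or feed crawl
        # set crawl_active=false, record the crawl, schedule the next one
        await checkin_site(pool, site, crawl)
```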
|
||||
## Site annotations
|
||||
Database table `site_annotation` can have any number of annotations
|
||||
for a base_url. While crawling, these annotations are considered:
|
||||
Blacklisting or whitelisting has precedence over function `site_filter`
|
||||
(in plugin `filter_site`).
|
||||
|
||||
Annotations cannot be managed from within atextcrawler;
|
||||
this requires another application, usually [`atextsearch`](https://TODO).
|
||||
|
||||
Each annotation requires a base_url of the annotated site and
|
||||
if a site with this base_url exists in the `site` table,
|
||||
it should also be associated with the site's id (column `site_id`).
|
||||
|
||||
## Limitations
|
||||
* atextcrawler is not optimized for speed; it is meant to be run as a
|
||||
background task on a server with limited resources
|
||||
(or even an SBC, like raspberry pi, with attached storage)
|
||||
* atextcrawler only indexes text, no other resources like images
|
23
doc/source/maintenance.md
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Maintenance
|
||||
|
||||
## Upgrading
|
||||
```
|
||||
su - atextcrawler
|
||||
pip3 install --user --upgrade pipenv
|
||||
cd repo
|
||||
git pull
|
||||
pipenv sync
|
||||
systemctl restart atextcrawler
|
||||
```
|
||||
|
||||
## Update tldextract
|
||||
From time to time run (in the Python virtualenv):
|
||||
```
|
||||
tldextract --update
|
||||
```
|
||||
or
|
||||
```
|
||||
systemctl stop atextcrawler
|
||||
rm -r $HOME/.cache/python-tldextract
|
||||
systemctl start atextcrawler
|
||||
```
|
98
doc/source/tensorflow_model_server.md
Normal file
|
@ -0,0 +1,98 @@
|
|||
# Tensorflow model server
|
||||
|
||||
## Setup server
|
||||
Prepare:
|
||||
```
|
||||
apt install gnupg2
|
||||
```
|
||||
Add repo:
|
||||
```
|
||||
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
|
||||
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
|
||||
```
|
||||
Install package:
|
||||
```
|
||||
apt update
|
||||
apt install tensorflow-model-server
|
||||
```
|
||||
|
||||
## Setup models
|
||||
```
|
||||
mkdir -p /srv/tensorflow/workdir
|
||||
mkdir -p /srv/tensorflow/models
|
||||
```
|
||||
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
|
||||
```
|
||||
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
|
||||
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
|
||||
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
|
||||
wget -O universal-sentence-encoder-multilingual_3.tar.gz 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed'
|
||||
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
|
||||
rm universal-sentence-encoder-multilingual_3.tar.gz
|
||||
```
|
||||
|
||||
Check:
|
||||
```
|
||||
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
|
||||
```
|
||||
|
||||
Config file `/srv/tensorflow/config`:
|
||||
```
|
||||
model_config_list: {
|
||||
config: {
|
||||
name: "sentences",
|
||||
base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
|
||||
model_platform: "tensorflow"
|
||||
model_version_policy: {latest{}},
|
||||
},
|
||||
config: {
|
||||
... (next model)
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
## Systemd integration
|
||||
Edit /etc/systemd/system/tensorflow.service
|
||||
```
|
||||
[Unit]
|
||||
Description=tensorflow model server
|
||||
After=network.target auditd.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
WorkingDirectory=/srv/tensorflow/workdir
|
||||
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
|
||||
KillMode=process
|
||||
Restart=on-failure
|
||||
RestartSec=30s
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```
|
||||
and
|
||||
```
|
||||
systemctl daemon-reload
|
||||
systemctl enable tensorflow
|
||||
systemctl start tensorflow
|
||||
```
|
||||
|
||||
Check:
|
||||
```
|
||||
http -j GET http://localhost:9000/v1/models/sentences
|
||||
```
|
||||
|
||||
## Usage
|
||||
Show model details:
|
||||
```
|
||||
http -j GET http://localhost:9000/v1/models/sentences/metadata
|
||||
```
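A minimal sketch (assuming `aiohttp` from the Pipfile and the `sentences` model name configured above) of calling the predict endpoint to embed text chunks:
```
# Hedged sketch: POST texts to the TF Serving REST predict API.
import asyncio

import aiohttp

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'


async def embed(texts):
    async with aiohttp.ClientSession() as session:
        async with session.post(ENDPOINT, json={'instances': texts}) as resp:
            data = await resp.json()
    return data['predictions']  # one embedding vector per input text


vectors = asyncio.run(embed(['Hello world', 'Hallo Welt']))
print(len(vectors), len(vectors[0]))
```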
|
||||
|
||||
## Docs
|
||||
|
||||
* `/usr/bin/tensorflow_model_server --help`
|
||||
* https://github.com/tensorflow/serving/
|
||||
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
|
||||
* https://github.com/hey-car/tensorflow-model-server
|
||||
|
||||
Datasets:
|
||||
* https://www.tensorflow.org/datasets/catalog/overview
|
48
license.txt
Normal file
|
@ -0,0 +1,48 @@
|
|||
ANTI-AUTHORITARIAN LICENSE version 1.0
|
||||
________________________________________________________________________________
|
||||
|
||||
Obviously, this license is relevant to all who are bound by law.
|
||||
|
||||
The licensee ("you") must not be a commercial, military, clerical or
|
||||
governmental entity. For this license the term "software" means the program
|
||||
code, documentation as well as other data (for instance, language files).
|
||||
|
||||
Subject to the respective terms and conditions described below the licensee
|
||||
is granted the non-exclusive and non-transferable license to:
|
||||
A. make copies of the software
|
||||
B. create derivative works ("modifications")
|
||||
C. install and run copies or modifications of the software on any number of
|
||||
servers, thereby making them usable for the licensee and possibly others
|
||||
D. offer or give copies or modifications of the software, or parts of the
|
||||
unmodified or modified software to others
|
||||
|
||||
For these permissions the respective conditions stated below must be met:
|
||||
* For permission A condition 1 must be met.
|
||||
* For permission B all of the conditions 1, 3, 4 must be met.
|
||||
* For permission C all of the conditions 2, 3 must be met.
|
||||
* For permission D all of the conditions 1, 2, 3, 4, 5 must be met.
|
||||
|
||||
These are the conditions:
|
||||
1. You include this copyright notice and license in any copy or modification.
|
||||
In files that contain a reference to it you preserve this reference.
|
||||
2. You do not use this software or any modification of it for any commercial
|
||||
purpose or for monetary gain, and also not for any military, governmental
|
||||
or religious purpose; here with commercial purpose we mean activities which
|
||||
have among their goals to make profit, be it monetary profit or any other
|
||||
kind of profit that may entail or contribute to monetary profit.
|
||||
3. Demos or screenshots of the modified or unmodified software must not be
|
||||
published in any medium which requires the viewers to pay money in order
|
||||
to see the contents; here money paid for mere internet connectivity (i.e.,
|
||||
independent of the content supplier) is to be disregarded.
|
||||
4. You do not impose any further restrictions on this software or any
|
||||
derivative works beyond those restrictions herein.
|
||||
5. The copy or modification must include source code, and must allow
|
||||
distribution in source code as well as compiled form. The source code
|
||||
must be the preferred form in which a programmer would modify the program.
|
||||
Deliberately obfuscated source code is not allowed. Intermediate forms
|
||||
such as the output of a preprocessor or translator are not allowed.
|
||||
|
||||
For this license itself, if re-used for other software, the following
|
||||
copyright and license applies (copyheart license):
|
||||
|
||||
♡ Copying is an act of love. Please copy.
|
10
pyproject.toml
Normal file
|
@ -0,0 +1,10 @@
|
|||
# TOML formatted file; see PEP 518
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
#multi_line_output = 3
|
||||
|
||||
[tool.black]
|
||||
line-length = 79
|
||||
target_version = ['py39']
|
||||
skip-string-normalization = true
|
0
src/atextcrawler/__init__.py
Normal file
12
src/atextcrawler/__main__.py
Normal file
|
@ -0,0 +1,12 @@
|
|||
"""
|
||||
atextcrawler application execution entry point.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from .application import Application
|
||||
from .config import Config
|
||||
|
||||
if __name__ == '__main__':
|
||||
config = Config().get()
|
||||
asyncio.run(Application(config).run())
|
204
src/atextcrawler/application.py
Normal file
|
@ -0,0 +1,204 @@
|
|||
"""
|
||||
atextcrawler application.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import importlib
|
||||
import logging
|
||||
import signal
|
||||
import sys
|
||||
|
||||
from systemd.journal import JournalHandler
|
||||
|
||||
from .config import Config
|
||||
from .crawl import CrawlWorker
|
||||
from .db import PGPool
|
||||
from .search import shutdown_engine, startup_engine
|
||||
from .site import load_seeds, process_site_queue
|
||||
|
||||
plugin_names = ['filter_site', 'filter_site_path', 'filter_resource_path']
|
||||
|
||||
|
||||
class Application:
|
||||
"""
|
||||
atextcrawler application.
|
||||
|
||||
The basic structure of the application is this:
|
||||
* one site crawler works just on the site_queue: fetching start pages
|
||||
of sites and storing updated site information in table sites
|
||||
* N other CrawlWorkers each do this in a loop:
|
||||
checkout a site that is due for crawl and crawl its resources;
|
||||
they fill the site_queue
|
||||
"""
|
||||
|
||||
running = True
|
||||
|
||||
def __init__(self, config=None):
|
||||
if config is None:
|
||||
config = Config().get()
|
||||
self.config = config
|
||||
self.instance_name = config['instance_name']
|
||||
self.instance_type = config['instance_type']
|
||||
log_level = getattr(
|
||||
logging, config['log_level'].upper(), logging.CRITICAL
|
||||
)
|
||||
self.logger = logging.getLogger('atextcrawler')
|
||||
self.logger.setLevel(log_level)
|
||||
if self.instance_type == 'dev':
|
||||
self.logger.addHandler(logging.StreamHandler())
|
||||
else:
|
||||
self.logger.addHandler(
|
||||
JournalHandler(SYSLOG_IDENTIFIER=self.instance_name)
|
||||
)
|
||||
self.logger.propagate = False
|
||||
self.channel = 'atextcrawler_' + self.config['instance_name']
|
||||
msg = f'Instance "{self}" initializing'
|
||||
self.logger.info(msg)
|
||||
self.plugins = self._load_plugins()
|
||||
|
||||
def __str__(self):
|
||||
return self.instance_name
|
||||
|
||||
def _load_plugins(self):
|
||||
"""
|
||||
Return a dict mapping plugin names to modules.
|
||||
"""
|
||||
modules = {}
|
||||
old_path = sys.path
|
||||
for name in plugin_names:
|
||||
try:
|
||||
plugins_dir = self.config['plugins_dir']
|
||||
sys.path.insert(0, plugins_dir)
|
||||
module = importlib.import_module(name)
|
||||
msg = f'Loading plugin "{name}" from {plugins_dir}'
|
||||
except:
|
||||
module = importlib.import_module(
|
||||
'atextcrawler.plugin_defaults.' + name
|
||||
)
|
||||
msg = f'Loading plugin "{name}" from default location'
|
||||
self.logger.info(msg)
|
||||
modules[name] = module
|
||||
sys.path = old_path
|
||||
return modules
|
||||
|
||||
async def run(self):
|
||||
"""
|
||||
Application lifecycle.
|
||||
"""
|
||||
await asyncio.gather(self.wait_for_shutdown(), self.startup())
|
||||
await self.shutdown()
|
||||
|
||||
async def startup(self):
|
||||
"""
|
||||
Asynchronous startup.
|
||||
"""
|
||||
msg = f'Instance "{self}" starting components'
|
||||
self.logger.info(msg)
|
||||
self.search_engine = await startup_engine(self.config)
|
||||
self.pgpool = await PGPool(self.config['postgresql'])
|
||||
self.pool = self.pgpool.pool
|
||||
await load_seeds(self.config, self.pool)
|
||||
await reset_site_locks(self.pool)
|
||||
worker_count = self.config['crawl']['workers']
|
||||
self.workers = []
|
||||
for worker_number in range(worker_count):
|
||||
worker = await CrawlWorker(self, worker_number, self.pool)
|
||||
self.workers.append(worker)
|
||||
worker_coros = [worker.run() for worker in self.workers]
|
||||
await asyncio.gather(
|
||||
process_site_queue(self, self.pool),
|
||||
self.handle_notifications(),
|
||||
*worker_coros,
|
||||
)
|
||||
|
||||
async def wait_for_shutdown(self):
|
||||
"""
|
||||
Create a shutdown event (:class:`asyncio.Event`) and wait for it.
|
||||
|
||||
The event will be set by a signal handler for SIGINT
|
||||
and SIGTERM signals (see :meth:`Application.handle_shutdown_signal`).
|
||||
"""
|
||||
self.shutdown_event = asyncio.Event()
|
||||
for sig in (signal.SIGINT, signal.SIGTERM):
|
||||
asyncio.get_running_loop().add_signal_handler(
|
||||
sig, self.handle_shutdown_signal
|
||||
)
|
||||
self.logger.debug(f'{self} waiting for shutdown event')
|
||||
await self.shutdown_event.wait()
|
||||
self.logger.info(f'Instance "{self}" shutdown event')
|
||||
|
||||
def handle_shutdown_signal(self):
|
||||
"""
|
||||
Handle shutdown signal.
|
||||
"""
|
||||
if self.shutdown_event.is_set():
|
||||
return
|
||||
self.shutdown_event.set()
|
||||
self.running = False
|
||||
|
||||
async def shutdown(self):
|
||||
"""
|
||||
Asynchronous shutdown.
|
||||
"""
|
||||
self.logger.debug(f'Instance "{self}" shutting down')
|
||||
await self.notify_conn.remove_listener(
|
||||
self.channel, self.listen_callback
|
||||
)
|
||||
await self.pool.release(self.notify_conn)
|
||||
for worker in self.workers:
|
||||
await worker.shutdown()
|
||||
await shutdown_engine(self.search_engine)
|
||||
await self.pgpool.shutdown()
|
||||
self.logger.info(f'Instance "{self}" shutdown completed')
|
||||
|
||||
async def handle_notifications(self):
|
||||
"""
|
||||
Handle notifications using PostgreSQL's NOTIFY/LISTEN.
|
||||
"""
|
||||
self.notify_conn = await self.pool.acquire()
|
||||
await self.notify_conn.add_listener(self.channel, self.listen_callback)
|
||||
|
||||
def listen_callback(self, *args):
|
||||
"""
|
||||
Handle notify event from PostgreSQL.
|
||||
"""
|
||||
channel = args[2]
|
||||
if channel != self.channel:
|
||||
return
|
||||
message = args[3]
|
||||
if message.startswith('site_update '):
|
||||
try:
|
||||
site_id = int(message.removeprefix('site_update '))
|
||||
for worker in self.workers:
|
||||
if worker.site and site_id == worker.site.id_:
|
||||
msg = (
|
||||
f'Cancelling worker {worker.worker_number}'
|
||||
f' (site={site_id}) due to site_update'
|
||||
)
|
||||
self.logger.info(msg)
|
||||
worker.running = False
|
||||
except:
|
||||
pass
|
||||
|
||||
async def sleep(self, duration, t_slice=3):
|
||||
"""
|
||||
Sleep for *duration* seconds while self.running.
|
||||
|
||||
Check self.running every *t_slice* seconds.
|
||||
"""
|
||||
remaining = duration
|
||||
while remaining > 0 and self.running:
|
||||
await asyncio.sleep(min(t_slice, remaining))
|
||||
remaining -= t_slice
|
||||
|
||||
|
||||
async def reset_site_locks(pool):
|
||||
"""
|
||||
Remove locks leftover from last run: Set crawl_active=false for all sites.
|
||||
|
||||
This is relevant when the application was not shutdown properly (e.g.
|
||||
when the process was killed).
|
||||
"""
|
||||
async with pool.acquire() as conn:
|
||||
sql = "UPDATE site SET crawl_active = false WHERE crawl_active = true"
|
||||
await conn.execute(sql)
|
7
src/atextcrawler/assets/iana_langs
Normal file
|
@ -0,0 +1,7 @@
|
|||
The recommended language tags to use in webpages are from
|
||||
the IANA Language Subtag Registry (BCP47), see:
|
||||
https://www.w3.org/International/questions/qa-html-language-declarations
|
||||
https://r12a.github.io/app-subtags/
|
||||
|
||||
|
||||
wget -O- https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | rg '^Subtag: |^Tag: ' | sed -e 's/^Subtag: //' -e 's/^Tag: //'
|
219
src/atextcrawler/assets/iso_639-1
Normal file
|
@ -0,0 +1,219 @@
|
|||
aa
|
||||
ab
|
||||
ae
|
||||
af
|
||||
ak
|
||||
am
|
||||
an
|
||||
ar
|
||||
as
|
||||
av
|
||||
ay
|
||||
az
|
||||
ba
|
||||
be
|
||||
bg
|
||||
bh
|
||||
bi
|
||||
bm
|
||||
bn
|
||||
bo
|
||||
br
|
||||
bs
|
||||
ca
|
||||
ca
|
||||
ce
|
||||
ch
|
||||
co
|
||||
cr
|
||||
cs
|
||||
cu
|
||||
cu
|
||||
cu
|
||||
cu
|
||||
cu
|
||||
cv
|
||||
cy
|
||||
da
|
||||
de
|
||||
dv
|
||||
dv
|
||||
dv
|
||||
dz
|
||||
ee
|
||||
el
|
||||
en
|
||||
eo
|
||||
es
|
||||
es
|
||||
et
|
||||
eu
|
||||
fa
|
||||
ff
|
||||
fi
|
||||
fj
|
||||
fo
|
||||
fr
|
||||
fy
|
||||
ga
|
||||
gd
|
||||
gd
|
||||
gl
|
||||
gn
|
||||
gu
|
||||
gv
|
||||
ha
|
||||
he
|
||||
hi
|
||||
ho
|
||||
hr
|
||||
ht
|
||||
ht
|
||||
hu
|
||||
hy
|
||||
hz
|
||||
ia
|
||||
id
|
||||
ie
|
||||
ie
|
||||
ig
|
||||
ii
|
||||
ii
|
||||
ik
|
||||
io
|
||||
is
|
||||
it
|
||||
iu
|
||||
ja
|
||||
jv
|
||||
ka
|
||||
kg
|
||||
ki
|
||||
ki
|
||||
kj
|
||||
kj
|
||||
kk
|
||||
kl
|
||||
kl
|
||||
km
|
||||
kn
|
||||
ko
|
||||
kr
|
||||
ks
|
||||
ku
|
||||
kv
|
||||
kw
|
||||
ky
|
||||
ky
|
||||
la
|
||||
lb
|
||||
lb
|
||||
lg
|
||||
li
|
||||
li
|
||||
li
|
||||
ln
|
||||
lo
|
||||
lt
|
||||
lu
|
||||
lv
|
||||
mg
|
||||
mh
|
||||
mi
|
||||
mk
|
||||
ml
|
||||
mn
|
||||
mr
|
||||
ms
|
||||
mt
|
||||
my
|
||||
na
|
||||
nb
|
||||
nb
|
||||
nd
|
||||
nd
|
||||
ne
|
||||
ng
|
||||
nl
|
||||
nl
|
||||
nn
|
||||
nn
|
||||
no
|
||||
nr
|
||||
nr
|
||||
nv
|
||||
nv
|
||||
ny
|
||||
ny
|
||||
ny
|
||||
oc
|
||||
oj
|
||||
om
|
||||
or
|
||||
os
|
||||
os
|
||||
pa
|
||||
pa
|
||||
pi
|
||||
pl
|
||||
ps
|
||||
ps
|
||||
pt
|
||||
qu
|
||||
rm
|
||||
rn
|
||||
ro
|
||||
ro
|
||||
ro
|
||||
ru
|
||||
rw
|
||||
sa
|
||||
sc
|
||||
sd
|
||||
se
|
||||
sg
|
||||
si
|
||||
si
|
||||
sk
|
||||
sl
|
||||
sm
|
||||
sn
|
||||
so
|
||||
sq
|
||||
sr
|
||||
ss
|
||||
st
|
||||
su
|
||||
sv
|
||||
sw
|
||||
ta
|
||||
te
|
||||
tg
|
||||
th
|
||||
ti
|
||||
tk
|
||||
tl
|
||||
tn
|
||||
to
|
||||
tr
|
||||
ts
|
||||
tt
|
||||
tw
|
||||
ty
|
||||
ug
|
||||
ug
|
||||
uk
|
||||
ur
|
||||
uz
|
||||
ve
|
||||
vi
|
||||
vo
|
||||
wa
|
||||
wo
|
||||
xh
|
||||
yi
|
||||
yo
|
||||
za
|
||||
za
|
||||
zh
|
||||
zu
|
10000
src/atextcrawler/assets/top_1e4
Normal file
File diff suppressed because it is too large
337
src/atextcrawler/config.py
Normal file
|
@ -0,0 +1,337 @@
|
|||
"""
|
||||
Configuration loader and validator.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from io import TextIOBase
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from voluptuous import All
|
||||
from voluptuous import Any as VAny
|
||||
from voluptuous import Invalid, Length, Range, Required, Schema, Url
|
||||
from yaml import load
|
||||
|
||||
try:
|
||||
from yaml import CLoader as Loader # type: ignore
|
||||
except ImportError:
|
||||
from yaml import Loader # type: ignore
|
||||
|
||||
|
||||
class ConfigError(Exception):
|
||||
"""
|
||||
Application configuration error.
|
||||
"""
|
||||
|
||||
def __init__(self, err):
|
||||
self.msg = str(err)
|
||||
|
||||
def __str__(self):
|
||||
return f'Application configuration error: {self.msg}'
|
||||
|
||||
|
||||
class Config:
|
||||
"""
|
||||
Application configuration.
|
||||
|
||||
Access the full application configuration using :meth:`get`.
|
||||
|
||||
It is a dictionary with these keys:
|
||||
|
||||
* 'directory': the configuration directory being used
|
||||
* 'main': the main configuration from main.yaml, but
|
||||
postgresql configuration may be overriden by environment
|
||||
variable ATEXTCRAWLER_POSTGRESQL
|
||||
"""
|
||||
|
||||
config = None
|
||||
|
||||
@classmethod
|
||||
def get(
|
||||
cls,
|
||||
out: Optional[TextIOBase] = None,
|
||||
) -> Optional[dict]:
|
||||
"""
|
||||
Load and validate app configuration if not already done; return it.
|
||||
|
||||
On errors print them to *out* and if out is sys.stdout, then
|
||||
also exit with exit code 2. Otherwise just return None.
|
||||
"""
|
||||
if cls.config:
|
||||
return cls.config
|
||||
if out is None:
|
||||
out = sys.stdout # type: ignore
|
||||
_config = _load_config()
|
||||
msg = None
|
||||
if isinstance(_config, ConfigError):
|
||||
msg = f'ERROR: configuration could not be loaded: {_config}'
|
||||
else:
|
||||
config = _validate_config(_config)
|
||||
if isinstance(config, ConfigError):
|
||||
config_dir = _config.get('config_dir')
|
||||
msg = (
|
||||
f'ERROR: invalid configuration in {config_dir}:'
|
||||
f' {config}'
|
||||
)
|
||||
if isinstance(_config, ConfigError) or isinstance(config, ConfigError):
|
||||
print(msg, file=out)
|
||||
if out == sys.stdout:
|
||||
sys.exit(2)
|
||||
else:
|
||||
return None
|
||||
config['postgresql']['min_size'] = config['crawl']['workers'] + 2
|
||||
config['postgresql']['max_size'] = config['crawl']['workers'] + 2
|
||||
cls.config = config
|
||||
return config
|
||||
|
||||
|
||||
def _load_config() -> Union[ConfigError, dict]:
|
||||
"""
|
||||
Load configuration; search in multiple directories.
|
||||
|
||||
We search these locations; the first location containing main.yaml
|
||||
will be used::
|
||||
|
||||
* a directory defined in environment variable ATEXTCRAWLER_CONF
|
||||
* subdir .config/atextcrawler in the user's home (`$HOME`)
|
||||
* /etc/atextcrawler
|
||||
|
||||
In the same directory where this main.conf is located a subdirectory
|
||||
'plugins' must exist and contain the configurations of plugins.
|
||||
|
||||
On failure return the first error and None.
|
||||
Otherwise return None and a dict with these keys:
|
||||
|
||||
* `directory`: the used configuration directory
|
||||
* `main`: the main application configuration
|
||||
* `plugins`: a dict mapping plugins names to plugin configurations
|
||||
"""
|
||||
Path(__file__).parent.parent
|
||||
config_dirs = []
|
||||
if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
|
||||
config_dirs.append(Path(env_conf))
|
||||
if env_home := os.environ.get('HOME'):
|
||||
config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
|
||||
config_dirs.append(Path('/etc/atextcrawler'))
|
||||
for config_dir in config_dirs:
|
||||
main_yaml_path = config_dir / 'main.yaml'
|
||||
if main_yaml_path.exists():
|
||||
break
|
||||
else:
|
||||
locs = ', '.join([str(loc) for loc in config_dirs if loc])
|
||||
msg = (
|
||||
f'Missing main.yaml in all config locations: {locs}\n'
|
||||
f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
|
||||
f' to define a custom config directory.'
|
||||
)
|
||||
return ConfigError(msg)
|
||||
|
||||
# load main.yaml
|
||||
try:
|
||||
with main_yaml_path.open() as main_yaml:
|
||||
main_config = load(main_yaml.read(), Loader=Loader)
|
||||
except Exception as err:
|
||||
return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')
|
||||
|
||||
# main_config must be a dict
|
||||
if not isinstance(main_config, dict):
|
||||
return ConfigError(f'File {main_yaml_path} must contain a dictionary')
|
||||
|
||||
# postgresql config from environment has precedence
|
||||
postgresql_config = _get_env_postgresql()
|
||||
if isinstance(postgresql_config, ConfigError):
|
||||
return postgresql_config
|
||||
main_config['postgresql'] = postgresql_config or main_config['postgresql']
|
||||
|
||||
main_config['config_dir'] = str(config_dir)
|
||||
return main_config
|
||||
|
||||
|
||||
def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
|
||||
"""
|
||||
Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.
|
||||
|
||||
Return an error or the PostgreSQL config (which can be None if
|
||||
the environment variable is not defined.
|
||||
"""
|
||||
env_var = 'ATEXTCRAWLER_POSTGRESQL'
|
||||
value = os.environ.get(env_var, '').strip()
|
||||
if not value:
|
||||
return None
|
||||
param_names = (
|
||||
'host',
|
||||
'port',
|
||||
'database',
|
||||
'user',
|
||||
'password',
|
||||
'schema_name',
|
||||
)
|
||||
re_dsn = re.compile(
|
||||
'((' + '|'.join(param_names) + ')'
|
||||
'=("(((?=[^"\\\\]).|\\\\.)*)"' # value in double quotes
|
||||
'|\'(((?=[^\'\\\\]).|\\\\.)*)\'' # value in single quotes
|
||||
'|([^"\' ]*)' # value unquoted
|
||||
')( |$))+?'
|
||||
)
|
||||
params = {}
|
||||
for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
|
||||
params[varname] = (
|
||||
v3
|
||||
or (v1 or '').replace('\\"', '"')
|
||||
or (v2 or '').replace("\\'", "'")
|
||||
)
|
||||
if 'host' not in params:
|
||||
params['host'] = 'localhost'
|
||||
if 'port' not in params:
|
||||
params['port'] = '5432'
|
||||
if 'schema_name' not in params:
|
||||
params['schema_name'] = 'public'
|
||||
for name in param_names:
|
||||
if name not in params:
|
||||
return ConfigError(
|
||||
f'Missing {name} in environment variable {env_var}'
|
||||
)
|
||||
else:
|
||||
params['port'] = int(params['port'])
|
||||
return params
|
||||
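
# Usage sketch (illustrative values only, assuming the parsing above): the
# DSN is a space-separated list of key=value pairs, values optionally quoted.
#
#   os.environ['ATEXTCRAWLER_POSTGRESQL'] = (
#       "host=localhost port=5432 database=atextcrawler"
#       " user=atextcrawler password='secret phrase' schema_name=public"
#   )
#   _get_env_postgresql()
#   # roughly -> {'host': 'localhost', 'port': 5432,
#   #             'database': 'atextcrawler', 'user': 'atextcrawler',
#   #             'password': 'secret phrase', 'schema_name': 'public'}
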
def _validate_config(config: Any) -> Union[ConfigError, dict]:
    """
    Validate the given configuration and fill in default values.

    If invalid, return only the first error.
    Otherwise return the configuration with added default values.
    """
    try:
        return schema_main(config)
    except Exception as err:
        return ConfigError(err)


def plugins_dir(config):
    """
    Validate plugins directory (absolute or relative path).

    If it is a relative path, prepend the config_dir.
    """
    config_dir = config['config_dir']
    plugins_dir = config['plugins_dir']
    if plugins_dir.startswith('/'):
        try:
            plugins_dir = Path(plugins_dir)
        except:
            raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
    else:
        try:
            plugins_dir = str(Path(config_dir) / Path(plugins_dir))
            config['plugins_dir'] = plugins_dir
        except:
            raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
    if not (Path(plugins_dir) / '__init__.py').exists():
        raise Invalid(f'plugins_dir "{plugins_dir}" has no "__init__.py"')
    return config


def postgresql_identifier(value):
    """
    Validate a PostgreSQL identifier.
    """
    if not isinstance(value, str) or not re.match(
        '^[a-z][a-z0-9_]{0,30}$', value
    ):
        raise Invalid(
            f'Invalid PostgreSQL identifier "{value}", '
            'pattern must be: [a-z][a-z0-9_]{0,30}'
        )
    return value


def positive_number(value):
    """
    Validate a positive number (int or float).
    """
    if (isinstance(value, int) or isinstance(value, float)) and value > 0:
        return value
    raise Invalid('Not a positive number')


schema_postgresql = Schema(
    {
        Required('host'): All(str, Length(min=1)),
        Required('port', default=5432): All(int, Range(min=0, max=65535)),
        Required('database'): All(str, Length(min=1)),
        Required('user'): All(str, Length(min=1)),
        Required('password'): str,
        Required('schema_name', default='public'): postgresql_identifier,
    }
)


schema_crawl = Schema(
    {
        Required('workers', default=10): All(int, Range(min=0, max=1000)),
        Required('site_delay', default=600): positive_number,
        Required('site_revisit_interval', default=3600): positive_number,
        Required('resource_delay', default=5): positive_number,
        Required('full_crawl_interval', default=864000): positive_number,
        Required('feed_crawl_interval', default=86400): positive_number,
    }
)


schema_elasticsearch = Schema(
    {
        Required('host'): All(str, Length(min=1)),
        Required('api_key'): All(str, Length(min=1)),
        Required('id'): All(str, Length(min=1)),
        Required('index_base_name'): All(str, Length(min=1)),
    }
)


schema_tensorflow = Schema(
    {
        Required('model_server_endpoint'): Url(),
    }
)


schema_main = Schema(
    All(
        {
            Required('config_dir'): All(str, Length(min=1)),
            Required(
                'instance_name', default='atextcrawler'
            ): postgresql_identifier,
            Required('instance_type', default='prod'): VAny(
                'dev',
                'staging',
                'prod',
            ),
            Required('log_level', default='info'): VAny(
                'critical',
                'error',
                'warning',
                'info',
                'debug',
            ),
            Required('plugins_dir', default='plugins'): All(
                str, Length(min=1)
            ),
            Required('postgresql'): schema_postgresql,
            Required('crawl'): schema_crawl,
            Required('elasticsearch'): schema_elasticsearch,
            Required('tensorflow'): schema_tensorflow,
        },
        plugins_dir,
    )
)


if __name__ == '__main__':
    from pprint import pprint

    pprint(Config().get())
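
A minimal sketch of how the validated configuration is typically consumed (assuming the package is importable as atextcrawler; the keys are those defined by the schemas above, and the pool bounds are derived in Config.get):

    from atextcrawler.config import Config

    config = Config.get()
    workers = config['crawl']['workers']
    # Config.get() sets both PostgreSQL pool bounds to workers + 2:
    assert config['postgresql']['min_size'] == workers + 2
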
215
src/atextcrawler/crawl.py
Normal file
@@ -0,0 +1,215 @@
"""
|
||||
Crawl a site.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
import aiohttp
|
||||
|
||||
from .models import Crawl
|
||||
from .resource import ResourceFetcher, get_site_path, process_site_path
|
||||
from .site import (
|
||||
RobotsInfo,
|
||||
checkin_site,
|
||||
checkout_site,
|
||||
fetch_feeds,
|
||||
process_site,
|
||||
update_site,
|
||||
)
|
||||
from .tensorflow import TensorFlow
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CrawlWorker:
|
||||
"""
|
||||
Worker fetching sites, crawling their resources and storing statistics.
|
||||
"""
|
||||
|
||||
def __init__(self, app, worker_number, pool):
|
||||
self.app = app
|
||||
self.worker_number = worker_number
|
||||
self.pool = pool
|
||||
self.site_delay = self.app.config['crawl']['site_delay']
|
||||
self.resource_delay = self.app.config['crawl']['resource_delay']
|
||||
self.site = None
|
||||
self.crawl = None
|
||||
self.running = True # do crawl
|
||||
|
||||
def __await__(self):
|
||||
return self.__ainit__().__await__()
|
||||
|
||||
async def __ainit__(self):
|
||||
await self.startup()
|
||||
return self
|
||||
|
||||
async def startup(self):
|
||||
"""
|
||||
Asynchronous startup.
|
||||
"""
|
||||
logger.info(f'Starting worker {self.worker_number}')
|
||||
self.conn = await self.pool.acquire()
|
||||
self.session = aiohttp.ClientSession()
|
||||
self.fetcher = ResourceFetcher(self.session)
|
||||
self.tf = TensorFlow(self.app, self.session)
|
||||
|
||||
async def shutdown(self):
|
||||
"""
|
||||
Asynchronous shutdown.
|
||||
"""
|
||||
logger.info(f'Shutting down worker {self.worker_number}')
|
||||
await self.session.close()
|
||||
await self.pool.release(self.conn)
|
||||
|
||||
async def run(self):
|
||||
"""
|
||||
Worker loop: fetch a site, crawl its resources and store statistics.
|
||||
|
||||
If no site needs to be crawled, sleep for self.site_delay seconds
|
||||
(configured in crawl.site_delay).
|
||||
"""
|
||||
await self.app.sleep(2)
|
||||
while self.app.running and self.running:
|
||||
self.site, is_full, more = await checkout_site(self.app, self.conn)
|
||||
if not self.site:
|
||||
msg = f'Worker {self.worker_number}: sites exhausted'
|
||||
logger.debug(msg)
|
||||
if not more:
|
||||
await self.app.sleep(self.site_delay)
|
||||
continue
|
||||
self.crawl = await get_or_create_crawl(
|
||||
self.conn, self.site.id_, is_full
|
||||
)
|
||||
try:
|
||||
if is_full:
|
||||
site_upd, _ = await update_site(
|
||||
self.app,
|
||||
self.fetcher,
|
||||
self.conn,
|
||||
self.site.base_url,
|
||||
site=self.site,
|
||||
)
|
||||
if site_upd and site_upd.crawl_enabled:
|
||||
self.site = site_upd
|
||||
await process_site(
|
||||
self.fetcher,
|
||||
self.conn,
|
||||
self.site,
|
||||
)
|
||||
elif self.site.crawl_enabled:
|
||||
await fetch_feeds(self.fetcher, self.conn, self.site)
|
||||
if self.site.crawl_enabled:
|
||||
await self.crawl_resources()
|
||||
except:
|
||||
msg = (
|
||||
f'Worker {self.worker_number} failed crawl'
|
||||
f' {self.crawl.id_} of site {self.site.id_}'
|
||||
f' ({self.site.base_url})'
|
||||
)
|
||||
logger.exception(msg)
|
||||
await self.crawl.finish(
|
||||
self.conn, self.app.running and self.running
|
||||
)
|
||||
await checkin_site(self.app, self.conn, self.site, self.crawl)
|
||||
msg = (
|
||||
f'Worker {self.worker_number} finished crawl'
|
||||
f' {self.crawl.id_}'
|
||||
)
|
||||
logger.debug(msg)
|
||||
self.site = None
|
||||
# if we were cancelled, but the app is still running, run again
|
||||
if self.app.running:
|
||||
self.running = True
|
||||
msg = f'Closing crawler {self.worker_number}'
|
||||
logger.debug(msg)
|
||||
|
||||
async def crawl_resources(self):
|
||||
"""
|
||||
Loop over resources of the site and process them. Collect statistics.
|
||||
|
||||
All workers operate on distinct sites, so no need for locking here.
|
||||
"""
|
||||
crawl_type = 'full' if self.crawl.is_full else 'feed'
|
||||
msg = (
|
||||
f'Worker {self.worker_number} beginning'
|
||||
f' {crawl_type} crawl {self.crawl.id_}'
|
||||
f' of site {self.site.id_} ({self.site.base_url})'
|
||||
)
|
||||
logger.info(msg)
|
||||
resource_delay = self.resource_delay
|
||||
robots = await RobotsInfo(self.site.base_url)
|
||||
if robots.delay:
|
||||
resource_delay = robots.delay
|
||||
while self.app.running and self.running:
|
||||
site_path = await get_site_path(
|
||||
self.conn,
|
||||
self.site,
|
||||
self.crawl.t_begin,
|
||||
only_new=not self.crawl.is_full,
|
||||
)
|
||||
if not site_path:
|
||||
msg = (
|
||||
f'Worker {self.worker_number} ending crawl'
|
||||
f' {self.crawl.id_}: paths exhausted'
|
||||
)
|
||||
logger.info(msg)
|
||||
return
|
||||
try:
|
||||
sp_filter = self.app.plugins['filter_site_path'].sp_filter
|
||||
if sp_filter(self.site, site_path.path, robots):
|
||||
is_new_resource = await process_site_path(
|
||||
self.app,
|
||||
self.worker_number,
|
||||
self.conn,
|
||||
self.fetcher,
|
||||
self.tf,
|
||||
self.site,
|
||||
site_path,
|
||||
)
|
||||
if is_new_resource:
|
||||
self.crawl.n_resources_new += 1
|
||||
if is_new_resource is not None:
|
||||
self.crawl.n_resources += 1
|
||||
await self.app.sleep(resource_delay)
|
||||
else:
|
||||
sql = (
|
||||
"UPDATE site_path SET"
|
||||
" last_visit=now() at time zone 'UTC',"
|
||||
" filtered=true"
|
||||
" WHERE id=$1"
|
||||
)
|
||||
await self.conn.execute(sql, site_path.id_)
|
||||
except:
|
||||
msg = (
|
||||
f'Worker {self.worker_number} processing path failed'
|
||||
f' in crawl {self.crawl.id_}: {site_path}'
|
||||
)
|
||||
logger.exception(msg)
|
||||
site_path.ok_count -= 1
|
||||
await site_path.save(self.conn)
|
||||
msg = (
|
||||
f'Worker {self.worker_number}: stopped crawl' f' {self.crawl.id_}'
|
||||
)
|
||||
logger.info(msg)
|
||||
|
||||
|
||||
async def get_or_create_crawl(conn, site_id, is_full=True) -> Crawl:
|
||||
"""
|
||||
Return a new or existing+unfinished crawl.
|
||||
|
||||
If an existing crawl is found, return it, disregarding whether
|
||||
it is a full crawl or not.
|
||||
"""
|
||||
sql = "SELECT * FROM crawl WHERE site_id=$1 AND t_end is null LIMIT 1"
|
||||
if row := await conn.fetchrow(sql, site_id):
|
||||
return await Crawl().load_from_row(row)
|
||||
else:
|
||||
# create a new crawl
|
||||
crawl = Crawl(
|
||||
site_id=site_id,
|
||||
is_full=is_full,
|
||||
t_begin=datetime.utcnow(),
|
||||
)
|
||||
await crawl.save(conn)
|
||||
return crawl
|
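
CrawlWorker implements __await__, so constructing and awaiting an instance runs its asynchronous startup. A minimal sketch of starting workers, assuming an application object `app` (providing config, plugins, running and sleep) and an asyncpg pool `pool` supplied by the application core, which is not part of this file:

    import asyncio

    async def start_workers(app, pool):
        n = app.config['crawl']['workers']
        workers = [await CrawlWorker(app, i, pool) for i in range(n)]
        tasks = [asyncio.create_task(worker.run()) for worker in workers]
        await asyncio.gather(*tasks)
        for worker in workers:
            await worker.shutdown()
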
162
src/atextcrawler/db.py
Normal file
@@ -0,0 +1,162 @@
"""
|
||||
PostgreSQL connectivity.
|
||||
|
||||
PGPool can be used as context manager. It takes postgresql configuration
|
||||
parameters and gives a connection pool.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from io import TextIOBase
|
||||
from pathlib import Path
|
||||
from traceback import format_exc
|
||||
from typing import Dict
|
||||
|
||||
import asyncpg
|
||||
|
||||
from .utils.json import json_dumps, json_loads
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PGPool:
|
||||
"""
|
||||
Database connectivity: Provide a connection pool.
|
||||
|
||||
Can be used either as async context manager (giving a pool),
|
||||
or as a class using async init and the shutdown method and
|
||||
having the pool attribute.
|
||||
|
||||
After startup self.pool contains a PostgreSQL connection pool
|
||||
(instance of :class:`asyncpg.pool.Pool`).
|
||||
|
||||
Startup also runs schema migrations (cf. directory `migrations`).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
postgresql_config: dict,
|
||||
out: TextIOBase = None,
|
||||
check: bool = True,
|
||||
) -> None:
|
||||
self.conf = postgresql_config
|
||||
self.out = out or sys.stdout
|
||||
self.check = check
|
||||
self.pool = None
|
||||
|
||||
def __await__(self):
|
||||
return self.__ainit__().__await__()
|
||||
|
||||
async def __ainit__(self):
|
||||
await self.__aenter__()
|
||||
return self
|
||||
|
||||
async def __aenter__(self):
|
||||
"""
|
||||
Return the connection pool after an optional check.
|
||||
|
||||
The check tests basic database access and runs missing migrations.
|
||||
If the check fails, return None.
|
||||
"""
|
||||
pool_params = {
|
||||
key: val
|
||||
for key, val in self.conf.items()
|
||||
if key
|
||||
in (
|
||||
'host',
|
||||
'port',
|
||||
'database',
|
||||
'user',
|
||||
'password',
|
||||
'max_size',
|
||||
'min_size',
|
||||
)
|
||||
}
|
||||
pool_params['command_timeout'] = 30
|
||||
self.pool = await asyncpg.create_pool(**pool_params, init=self._init)
|
||||
if self.check:
|
||||
async with self.pool.acquire() as conn:
|
||||
if await self.check_or_migrate(conn):
|
||||
return self.pool
|
||||
|
||||
@staticmethod
|
||||
async def _init(conn) -> None:
|
||||
"""
|
||||
Add JSON encoding and decoding to the given connection.
|
||||
"""
|
||||
await conn.set_type_codec(
|
||||
'jsonb',
|
||||
encoder=json_dumps,
|
||||
decoder=json_loads,
|
||||
schema='pg_catalog',
|
||||
)
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb) -> None:
|
||||
"""
|
||||
Close the connection pool.
|
||||
"""
|
||||
await self.shutdown()
|
||||
|
||||
async def shutdown(self):
|
||||
"""
|
||||
Close the pool.
|
||||
"""
|
||||
await self.pool.close()
|
||||
|
||||
async def check_or_migrate(self, conn: asyncpg.Connection) -> bool:
|
||||
"""
|
||||
Check database connectivity.
|
||||
|
||||
Return whether database connectivity is working.
|
||||
"""
|
||||
row = await conn.fetchrow('SELECT 1+1 AS result')
|
||||
if not row or row.get('result') != 2:
|
||||
msg = 'Database SELECT 1+1 not working; missing privileges?'
|
||||
print(msg, file=self.out)
|
||||
logger.critical(msg)
|
||||
return False
|
||||
|
||||
# determine current schema_version
|
||||
try:
|
||||
sql = "SELECT value::int FROM kvs WHERE key='schema_version'"
|
||||
schema_version = await conn.fetchval(sql)
|
||||
except:
|
||||
schema_version = 0
|
||||
|
||||
# run missing migrations
|
||||
migrations = get_migrations()
|
||||
for number, text in sorted(migrations.items()):
|
||||
if number > schema_version:
|
||||
cmds = text.split('\n----\n')
|
||||
for cmd in cmds:
|
||||
if not cmd.strip():
|
||||
continue
|
||||
try:
|
||||
await conn.execute(cmd)
|
||||
except:
|
||||
msg = (
|
||||
f'Exception during migration {number} in '
|
||||
f'statement\n{cmd}'
|
||||
)
|
||||
print(msg, file=self.out)
|
||||
logger.critical(msg)
|
||||
print(format_exc(), file=self.out)
|
||||
logger.critical(format_exc())
|
||||
return False
|
||||
|
||||
# return success
|
||||
return True
|
||||
|
||||
|
||||
def get_migrations() -> Dict[int, str]:
|
||||
"""
|
||||
Return migrations (number and text content of migration file).
|
||||
"""
|
||||
migrations_dir = Path(__file__).parent / 'migrations'
|
||||
migrations = {}
|
||||
for migration_file in migrations_dir.glob('*.sql'):
|
||||
migration_number = int(migration_file.name[:-4])
|
||||
with migration_file.open() as mig_file:
|
||||
content = mig_file.read()
|
||||
migrations[migration_number] = content
|
||||
return migrations
|
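
A short usage sketch for PGPool as an async context manager, assuming a postgresql config dict as produced by the config module (the query targets the kvs table created by migration 1):

    async def example(postgresql_config):
        async with PGPool(postgresql_config) as pool:
            async with pool.acquire() as conn:
                # jsonb values are transparently (de)serialized by _init()
                version = await conn.fetchval(
                    "SELECT value FROM kvs WHERE key=$1", 'schema_version'
                )
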
297
src/atextcrawler/migrations/1.sql
Normal file
@@ -0,0 +1,297 @@
CREATE TABLE kvs (
|
||||
id bigserial PRIMARY KEY,
|
||||
t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
|
||||
key varchar(200) NOT NULL UNIQUE,
|
||||
value jsonb
|
||||
)
|
||||
----
|
||||
COMMENT ON COLUMN kvs.t_update IS 'Time of last update or insert of the entry';
|
||||
----
|
||||
COMMENT ON COLUMN kvs.key IS 'Key';
|
||||
----
|
||||
COMMENT ON COLUMN kvs.value IS 'Value';
|
||||
----
|
||||
COMMENT ON TABLE kvs IS 'Simple key-value store';
|
||||
----
|
||||
INSERT INTO kvs (key, value) VALUES ('schema_version', '1');
|
||||
----
|
||||
CREATE TABLE site (
|
||||
id bigserial PRIMARY KEY,
|
||||
canonical_url varchar(200),
|
||||
base_url varchar(200) NOT NULL,
|
||||
base_urls varchar(200)[] NOT NULL,
|
||||
domains varchar(100)[],
|
||||
ips inet[] NULL,
|
||||
crawl_enabled bool NOT NULL DEFAULT false,
|
||||
crawl_active bool NOT NULL DEFAULT false,
|
||||
next_full_crawl timestamp,
|
||||
next_feed_crawl timestamp,
|
||||
last_update timestamp,
|
||||
last_pub timestamp,
|
||||
pub_dates jsonb NOT NULL DEFAULT '{}'::jsonb,
|
||||
langs char(2)[] NOT NULL DEFAULT ARRAY[]::varchar(2)[],
|
||||
alt_langs jsonb NOT NULL DEFAULT '{}'::jsonb,
|
||||
title varchar(200),
|
||||
description varchar(2000),
|
||||
keywords varchar(50)[] NOT NULL DEFAULT ARRAY[]::varchar(50)[],
|
||||
linkbacks jsonb NOT NULL DEFAULT '{}'::jsonb,
|
||||
meta_info jsonb NOT NULL DEFAULT '{}'::jsonb,
|
||||
boilerplate_texts jsonb NOT NULL DEFAULT '[]'::jsonb
|
||||
)
|
||||
----
|
||||
CREATE INDEX site__base_url ON site (base_url)
|
||||
----
|
||||
CREATE INDEX site__base_urls ON site (base_urls)
|
||||
----
|
||||
CREATE INDEX site__domains ON site (domains)
|
||||
----
|
||||
CREATE INDEX site__ips ON site (ips)
|
||||
----
|
||||
CREATE INDEX site__next_full_crawl ON site (next_full_crawl)
|
||||
----
|
||||
CREATE INDEX site__next_feed_crawl ON site (next_feed_crawl)
|
||||
----
|
||||
CREATE INDEX site__langs ON site (langs)
|
||||
----
|
||||
CREATE INDEX site__title ON site (title)
|
||||
----
|
||||
CREATE INDEX site__description ON site (description)
|
||||
----
|
||||
CREATE INDEX site__keywords ON site (keywords)
|
||||
----
|
||||
COMMENT ON COLUMN site.base_url IS 'Preferred base URLs (from column base_urls)'
|
||||
----
|
||||
COMMENT ON COLUMN site.base_urls IS 'Base URLs that have been found to return the same content'
|
||||
----
|
||||
COMMENT ON COLUMN site.domains IS 'Domains that have been found to return the same content'
|
||||
----
|
||||
COMMENT ON COLUMN site.ips IS 'IPv4 or IPv6 addresses of the hostnames in base_urls'
|
||||
----
|
||||
COMMENT ON COLUMN site.crawl_enabled IS 'Whether the site is should be indexed'
|
||||
----
|
||||
COMMENT ON COLUMN site.crawl_active IS 'Whether the crawl is in progress'
|
||||
----
|
||||
COMMENT ON COLUMN site.next_full_crawl IS 'Crawl all resources of this site again after this instant of time; do not crawl if null'
|
||||
----
|
||||
COMMENT ON COLUMN site.next_feed_crawl IS 'Crawl the feed resources of this site again after this instant of time; do not crawl if null'
|
||||
----
|
||||
COMMENT ON COLUMN site.last_update IS 'Time of last update of this site (in this database)'
|
||||
----
|
||||
COMMENT ON COLUMN site.last_pub IS 'Estimated time of last content publication on the site'
|
||||
----
|
||||
COMMENT ON COLUMN site.pub_dates IS 'Change history: map visit date to estimated publication date'
|
||||
----
|
||||
COMMENT ON COLUMN site.langs IS 'Languages of the site (ISO 639-1 codes)'
|
||||
----
|
||||
COMMENT ON COLUMN site.alt_langs IS 'Map links to alternative language versions of the site to ISO 639-1 languages codes'
|
||||
----
|
||||
COMMENT ON COLUMN site.title IS 'Title as obtained from title tag or meta tags'
|
||||
----
|
||||
COMMENT ON COLUMN site.description IS 'Description as obtained from meta tags'
|
||||
----
|
||||
COMMENT ON COLUMN site.keywords IS 'Keywords as obtained from meta tags'
|
||||
----
|
||||
COMMENT ON COLUMN site.linkbacks IS 'Map URL to type of linkback (cf. https://en.wikipedia.org/wiki/Linkback)'
|
||||
----
|
||||
COMMENT ON COLUMN site.meta_info IS 'Values from meta tags and other meta information'
|
||||
----
|
||||
COMMENT ON COLUMN site.boilerplate_texts IS 'Boilerplate texts on the startpage and other sample pages'
|
||||
----
|
||||
COMMENT ON TABLE site IS 'Website'
|
||||
----
|
||||
CREATE TABLE site_queue (
|
||||
id bigserial PRIMARY KEY,
|
||||
src bigint NULL REFERENCES site(id) ON DELETE CASCADE,
|
||||
url varchar(200) NOT NULL,
|
||||
link_text varchar(100),
|
||||
t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc')
|
||||
)
|
||||
----
|
||||
CREATE INDEX site_queue__url ON site_queue (url)
|
||||
----
|
||||
COMMENT ON COLUMN site_queue.src IS 'The id of the linking site; null in case of seeds or manual additions'
|
||||
----
|
||||
COMMENT ON COLUMN site_queue.url IS 'Base URL of site to be assessed, ending with a slash or a mandatory base path'
|
||||
----
|
||||
COMMENT ON COLUMN site_queue.link_text IS 'Text under the anchor tag on the source site'
|
||||
----
|
||||
COMMENT ON COLUMN site_queue.t_create IS 'Creation time of this entry'
|
||||
----
|
||||
COMMENT ON TABLE site_queue IS 'Queued site URLs'
|
||||
----
|
||||
CREATE TABLE site_feed (
|
||||
id bigserial PRIMARY KEY,
|
||||
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
|
||||
url varchar(200) NOT NULL,
|
||||
etag text,
|
||||
modified varchar(50),
|
||||
t_visit timestamp,
|
||||
t_content timestamp,
|
||||
version varchar(10),
|
||||
title varchar(200),
|
||||
description text,
|
||||
fail_count smallint NOT NULL DEFAULT 0
|
||||
)
|
||||
----
|
||||
CREATE INDEX site_feed__site ON site_feed (site_id)
|
||||
----
|
||||
CREATE INDEX site_feed__t_content ON site_feed (t_content)
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.site_id IS 'Id of the site on which this feed was found'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.url IS 'URL of the feed'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.etag IS 'Etag obtained when requesting the feed'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.modified IS 'Last-Modified HTTP header value obtained when requesting the feed'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.t_visit IS 'Time of last retrieval of the feed; null before first retrival'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.t_content IS 'Time of last content update; null before first retrieval'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.version IS 'Version of the feed; null before first retrival'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.title IS 'Title of the feed; null before first retrival'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.description IS 'Description of the feed; null before first retrival'
|
||||
----
|
||||
COMMENT ON COLUMN site_feed.fail_count IS 'Number of failed retrievals after last successful retrieval; zero before first retrival'
|
||||
----
|
||||
CREATE TABLE site_link (
|
||||
id bigserial PRIMARY KEY,
|
||||
src bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
|
||||
dst bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
|
||||
t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
|
||||
link_text varchar(100)
|
||||
)
|
||||
----
|
||||
ALTER TABLE site_link ADD CONSTRAINT site_link_edge UNIQUE (src, dst)
|
||||
----
|
||||
CREATE INDEX site_link__src ON site_link (src)
|
||||
----
|
||||
CREATE INDEX site_link__dst ON site_link (dst)
|
||||
----
|
||||
COMMENT ON COLUMN site_link.src IS 'Source site'
|
||||
----
|
||||
COMMENT ON COLUMN site_link.dst IS 'Destination site'
|
||||
----
|
||||
COMMENT ON COLUMN site_link.t_create IS 'Time of creation of this entry'
|
||||
----
|
||||
COMMENT ON COLUMN site_link.link_text IS 'Text under the anchor tag on the source site'
|
||||
----
|
||||
COMMENT ON TABLE site_link IS 'Cross-site link'
|
||||
----
|
||||
CREATE TABLE resource (
|
||||
id bigserial PRIMARY KEY,
|
||||
simhash bigint,
|
||||
content_type varchar(50),
|
||||
last_change timestamp,
|
||||
text_len int,
|
||||
lang char(2),
|
||||
title varchar(200),
|
||||
summary varchar(2000)
|
||||
)
|
||||
----
|
||||
COMMENT ON COLUMN resource.simhash IS 'Simhash of the text content of the resource'
|
||||
----
|
||||
COMMENT ON COLUMN resource.content_type IS 'Content type extracted from Content-Type HTTP header'
|
||||
----
|
||||
COMMENT ON COLUMN resource.last_change IS 'Estimated time of the last update of this resource'
|
||||
----
|
||||
COMMENT ON COLUMN resource.text_len IS 'Length of the extracted text in characters'
|
||||
----
|
||||
COMMENT ON COLUMN resource.lang IS 'Language ISO 639-1 code'
|
||||
----
|
||||
COMMENT ON COLUMN resource.title IS 'Title of the resource (used for feed resources)'
|
||||
----
|
||||
COMMENT ON COLUMN resource.summary IS 'Content summary of the resource (used for feed resources)'
|
||||
----
|
||||
COMMENT ON TABLE resource IS 'Text resource (may be reachable by more than one path of a site)'
|
||||
----
|
||||
CREATE TABLE site_path (
|
||||
id bigserial PRIMARY KEY,
|
||||
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
|
||||
path varchar(400) NOT NULL,
|
||||
last_visit timestamp,
|
||||
filtered bool NOT NULL DEFAULT false,
|
||||
ok_count smallint NOT NULL DEFAULT 0,
|
||||
canonical bool,
|
||||
resource_id bigint REFERENCES resource(id) ON DELETE CASCADE
|
||||
)
|
||||
----
|
||||
ALTER TABLE site_path ADD CONSTRAINT site_path__unique UNIQUE (site_id, path)
|
||||
----
|
||||
CREATE INDEX site_path__site_path ON site_path (site_id, path)
|
||||
----
|
||||
CREATE INDEX site_path__resource ON site_path (resource_id)
|
||||
----
|
||||
COMMENT ON COLUMN site_path.site_id IS 'Site id'
|
||||
----
|
||||
COMMENT ON COLUMN site_path.path IS 'Path'
|
||||
----
|
||||
COMMENT ON COLUMN site_path.last_visit IS 'Time of last retrieval of the resource; null before first retrival'
|
||||
----
|
||||
COMMENT ON COLUMN site_path.ok_count IS 'Increased by 1 for every successful retrieval of the resource and decreased by 1 for every failed'
|
||||
----
|
||||
COMMENT ON COLUMN site_path.canonical IS 'Whether the path is the canonical one for the resource; null before first retrival'
|
||||
----
|
||||
COMMENT ON COLUMN site_path.resource_id IS 'Resource id; null before first retrieval'
|
||||
----
|
||||
COMMENT ON TABLE site_path IS 'Paths of a site pointing to text resources'
|
||||
----
|
||||
CREATE TABLE crawl (
|
||||
id bigserial PRIMARY KEY,
|
||||
site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
|
||||
is_full bool NOT NULL DEFAULT false,
|
||||
t_begin timestamp,
|
||||
t_end timestamp,
|
||||
n_resources int NOT NULL DEFAULT 0,
|
||||
n_resources_new int NOT NULL DEFAULT 0
|
||||
)
|
||||
----
|
||||
CREATE INDEX crawl__site ON crawl (site_id)
|
||||
----
|
||||
CREATE INDEX crawl__t_begin ON crawl (t_begin)
|
||||
----
|
||||
COMMENT ON COLUMN crawl.site_id IS 'Site that is being crawled'
|
||||
----
|
||||
COMMENT ON COLUMN crawl.is_full IS 'Whether the crawl is a full crawl; if not it is a feed crawl'
|
||||
----
|
||||
COMMENT ON COLUMN crawl.t_begin IS 'Begin time of the crawl'
|
||||
----
|
||||
COMMENT ON COLUMN crawl.t_end IS 'End time of the crawl; if t_end is null resuming a crawl will fetch all resources with last_visit before t_begin'
|
||||
----
|
||||
COMMENT ON COLUMN crawl.n_resources IS 'Number of resources that were fetched during the crawl'
|
||||
----
|
||||
COMMENT ON COLUMN crawl.n_resources_new IS 'Number of new resources found during the crawl'
|
||||
----
|
||||
COMMENT ON TABLE resource IS 'Crawl of resources on a site'
|
||||
----
|
||||
CREATE TYPE site_annotation_type AS ENUM ('whitelist', 'blacklist', 'suggestion', 'review', 'audience', 'location', 'themes', 'timescale')
|
||||
----
|
||||
COMMENT ON TYPE site_annotation_type IS 'Type of site annotation'
|
||||
----
|
||||
CREATE TABLE site_annotation (
|
||||
id bigserial PRIMARY KEY,
|
||||
site_id bigint REFERENCES site(id) ON DELETE SET NULL,
|
||||
base_url varchar(200) NOT NULL,
|
||||
ann_type site_annotation_type NOT NULL,
|
||||
ann_content JSONB,
|
||||
t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc')
|
||||
)
|
||||
----
|
||||
CREATE INDEX site_annotation__site ON site_annotation (site_id)
|
||||
----
|
||||
CREATE INDEX site_annotation__base_url ON site_annotation (base_url)
|
||||
----
|
||||
COMMENT ON COLUMN site_annotation.site_id IS 'Site that is being annotated'
|
||||
----
|
||||
COMMENT ON COLUMN site_annotation.base_url IS 'Base URL of the site being annotated'
|
||||
----
|
||||
COMMENT ON COLUMN site_annotation.ann_type IS 'Annotation type'
|
||||
----
|
||||
COMMENT ON COLUMN site_annotation.ann_content IS 'Annotation content'
|
||||
----
|
||||
COMMENT ON COLUMN site_annotation.t_update IS 'Time of last update'
|
||||
----
|
||||
COMMENT ON TABLE site_annotation IS 'Manual annotations on a site'
|
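
The schema above is what the crawler queries at runtime. As a sketch (using asyncpg with a hypothetical site_id), paths of a site that were never visited and not filtered can be selected like this:

    async def unvisited_paths(conn, site_id):
        sql = (
            "SELECT path FROM site_path"
            " WHERE site_id=$1 AND last_visit IS NULL AND NOT filtered"
        )
        return [row['path'] for row in await conn.fetch(sql, site_id)]
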
610
src/atextcrawler/models.py
Normal file
@@ -0,0 +1,610 @@
"""
|
||||
Data Models.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import InitVar, asdict, dataclass, field, fields
|
||||
from datetime import date, datetime
|
||||
from itertools import chain
|
||||
from typing import Any, ClassVar, Optional
|
||||
|
||||
import tldextract
|
||||
from asyncpg import Connection
|
||||
|
||||
from .search import delete_resource
|
||||
from .utils.durl import Durl, get_url_variants
|
||||
from .utils.link import extract_domain
|
||||
from .utils.similarity import get_simhash, simhash_to_bigint
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ModelBase:
|
||||
"""
|
||||
Abstract base class for models.
|
||||
|
||||
Execute SQL to load, save, delete instances using asyncpg.
|
||||
"""
|
||||
|
||||
table: ClassVar
|
||||
id_: Optional[int] = 0
|
||||
|
||||
async def load(self, conn: Connection, id_: int) -> Optional[Any]:
|
||||
"""
|
||||
If loading fails, return None.
|
||||
"""
|
||||
sql = f"SELECT * FROM {self.table} WHERE id=$1"
|
||||
row = await conn.fetchrow(sql, id_)
|
||||
if not row:
|
||||
return None
|
||||
return await self.load_from_row(row)
|
||||
|
||||
async def load_from_row(self, row):
|
||||
"""
|
||||
If row is None, return None.
|
||||
"""
|
||||
if not row:
|
||||
return None
|
||||
data = dict(row)
|
||||
self.id_ = data.pop('id')
|
||||
self.__init__(**data)
|
||||
return self
|
||||
|
||||
async def save(self, conn: Connection) -> None:
|
||||
"""
|
||||
Save the instance (update if self.id_ is set, else insert).
|
||||
"""
|
||||
data = asdict(self)
|
||||
# logger.debug(f'Save {self}: id_={self.id_}')
|
||||
if self.id_: # update
|
||||
cols = ', '.join(data.keys())
|
||||
upds = ', '.join(
|
||||
[f'{col}=${i + 1}' for i, col in enumerate(data.keys())]
|
||||
)
|
||||
val_id = f'${len(data) + 1}'
|
||||
sql = f"UPDATE {self.table} SET {upds} WHERE id={val_id}"
|
||||
await conn.execute(sql, *data.values(), self.id_)
|
||||
else: # insert
|
||||
cols = ', '.join(data.keys())
|
||||
vals = ', '.join([f'${i + 1}' for i in range(len(data))])
|
||||
sql = (
|
||||
f"INSERT INTO {self.table} ({cols}) VALUES ({vals})"
|
||||
f" RETURNING id"
|
||||
)
|
||||
self.id_ = await conn.fetchval(sql, *data.values())
|
||||
|
||||
def asdict(self):
|
||||
"""
|
||||
Return instance data as dictionary.
|
||||
"""
|
||||
return asdict(self)
|
||||
|
||||
async def delete(self, conn: Connection) -> None:
|
||||
"""
|
||||
Delete the object if it has an id_.
|
||||
"""
|
||||
if self.id_:
|
||||
sql = f"DELETE FROM {self.table} WHERE id=$1"
|
||||
await conn.execute(sql, self.id_)
|
||||
|
||||
|
||||
class ResourceError:
|
||||
"""
|
||||
Error encountered while trying to fetch a resource.
|
||||
|
||||
ResourceError is used for cases when fetching a resource fails.
|
||||
"""
|
||||
|
||||
def __init__(self, msg, status=None, headers=None):
|
||||
self.msg = msg
|
||||
self.status = status
|
||||
self.headers = headers
|
||||
|
||||
def __repr__(self):
|
||||
return f'ResourceError: {self.msg}'
|
||||
|
||||
|
||||
class ResourceRedirect:
|
||||
"""
|
||||
A resource containing a redirect.
|
||||
"""
|
||||
|
||||
def __init__(self, urls):
|
||||
self.urls = urls
|
||||
|
||||
|
||||
@dataclass
|
||||
class TextResource(ModelBase):
|
||||
"""
|
||||
TextResource (without path).
|
||||
|
||||
TextResource models web resources with relevant text content.
|
||||
They are instantiated in modules page, document, ...; their metadata
|
||||
are stored in table `resource` and the text content is stored with the
|
||||
search engine.
|
||||
|
||||
Do not confuse with SitePath: Several SitePath instances
|
||||
may point to a TextResource. The TextResource holds the actual content.
|
||||
|
||||
If we are not dealing with the startpage of a new site,
|
||||
the init_fields dict usually will contain the site to which
|
||||
the resource belongs.
|
||||
"""
|
||||
|
||||
table: ClassVar = 'resource'
|
||||
init_fields: InitVar[dict] = None # additional fields after fetching
|
||||
search_fields: InitVar[dict] = None # additional fields for indexing
|
||||
|
||||
# database fields
|
||||
simhash: Optional[int] = None
|
||||
content_type: Optional[str] = None
|
||||
last_change: Optional[datetime] = None
|
||||
text_len: int = 0
|
||||
lang: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
summary: Optional[str] = None
|
||||
|
||||
def __post_init__(self, init_fields, search_fields):
|
||||
if init_fields is None:
|
||||
init_fields = {}
|
||||
self.init_fields = init_fields
|
||||
if search_fields is None:
|
||||
search_fields = {}
|
||||
self.search_fields = search_fields
|
||||
self.site = self.init_fields.get('site')
|
||||
self.site_id = self.site.id_ if self.site else None
|
||||
self._update_simhash()
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
f'TextResource(id={self.id_},'
|
||||
f' site_id={self.site_id},'
|
||||
f' type={self.content_type})'
|
||||
)
|
||||
|
||||
def _update_simhash(self):
|
||||
"""
|
||||
Update the simhash of the resource from its text content.
|
||||
"""
|
||||
if self.simhash is None:
|
||||
text = self.search_fields.get('text', '')
|
||||
self.simhash = simhash_to_bigint(get_simhash(text))
|
||||
|
||||
async def save(self, conn: Connection):
|
||||
"""
|
||||
Save the instance, extending the parent's method.
|
||||
"""
|
||||
self.content_type = (
|
||||
self.content_type[:50] if self.content_type else None
|
||||
)
|
||||
self.title = self.title[:200] if self.title else None
|
||||
self.summary = self.summary[:400] if self.summary else None
|
||||
self._update_simhash()
|
||||
if self.last_change is None:
|
||||
self.last_change = datetime.utcnow()
|
||||
await super().save(conn)
|
||||
|
||||
async def update_from_resource(self, upd: 'TextResource'):
|
||||
"""
|
||||
Update self with values from another resource.
|
||||
"""
|
||||
names = [field.name for field in fields(self)]
|
||||
for name in names:
|
||||
cur_val = getattr(self, name)
|
||||
upd_val = getattr(upd, name)
|
||||
if not cur_val and upd_val is not None:
|
||||
setattr(self, name, upd_val)
|
||||
init_names = [
|
||||
'headers',
|
||||
'redirects',
|
||||
'links_int',
|
||||
'links_ext',
|
||||
'shortlinks',
|
||||
'canonical',
|
||||
#'head',
|
||||
]
|
||||
self.init_fields = upd.init_fields
|
||||
self.search_fields = upd.search_fields
|
||||
# for init_name in init_names:
|
||||
# cur_val = self.init_fields.get(init_name)
|
||||
# upd_val = upd.init_fields.get(init_name)
|
||||
# if not cur_val and upd_val is not None:
|
||||
# self.init_fields[init_name] = upd_val
|
||||
|
||||
|
||||
@dataclass
|
||||
class MetaResource(ModelBase):
|
||||
"""
|
||||
Parent class for Feed, Sitemap, SitemapIndex.
|
||||
|
||||
MetaResource is a parent class for Feed, Sitemap, SitemapIndex.
|
||||
Their instances are not stored. Note: class Feed contains feed meta data
|
||||
and is stored in the database.
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class SitemapIndex(MetaResource):
|
||||
"""
|
||||
A SitemapIndex meta resource.
|
||||
|
||||
Just a list of the siteap URLs, nothing more.
|
||||
"""
|
||||
|
||||
sitemaps: list = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Sitemap(MetaResource):
|
||||
"""
|
||||
A Sitemap meta resource.
|
||||
|
||||
Just a list of the resulting links, nothing more.
|
||||
"""
|
||||
|
||||
urls: list = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Feed(MetaResource):
|
||||
"""
|
||||
A site's feed (RSS, Atom , ...).
|
||||
"""
|
||||
|
||||
table: ClassVar = 'site_feed'
|
||||
entries: InitVar[list] = None
|
||||
site_id: Optional[int] = None
|
||||
url: Optional[str] = None
|
||||
etag: Optional[str] = None
|
||||
modified: Optional[str] = None
|
||||
t_visit: Optional[datetime] = None
|
||||
t_content: Optional[datetime] = None
|
||||
version: Optional[str] = None
|
||||
title: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
fail_count: int = 0
|
||||
|
||||
def __post_init__(self, entries):
|
||||
self.entries = entries
|
||||
|
||||
def __str__(self):
|
||||
return f'Feed(id={self.id_}, site_id={self.site_id}, url={self.url})'
|
||||
|
||||
async def save(self, conn: Connection):
|
||||
"""
|
||||
Save, trying to merge with existing entry matching on site_id and url.
|
||||
"""
|
||||
if not self.site_id or not self.url:
|
||||
msg = f'Saving feed failed: missing site_id of url'
|
||||
logger.error(msg)
|
||||
return
|
||||
sql = "SELECT id FROM site_feed WHERE site_id=$1 AND url=$2"
|
||||
self.id_ = await conn.fetchval(sql, self.site_id, self.url)
|
||||
await super().save(conn)
|
||||
|
||||
def debug(self) -> str:
|
||||
"""
|
||||
Return the instance data asa string for debug print output.
|
||||
"""
|
||||
return (
|
||||
f'Feed:\n'
|
||||
f'- id: {self.id_}\n'
|
||||
f'- site_id: {self.site_id}\n'
|
||||
f'- url: {self.url}\n'
|
||||
f'- etag: {self.etag}\n'
|
||||
f'- modified: {self.modified}\n'
|
||||
f'- t_visit: {self.t_visit}\n'
|
||||
f'- t_content: {self.t_content}\n'
|
||||
f'- version: {self.version}\n'
|
||||
f'- title: {self.title}\n'
|
||||
f'- description: {self.description}\n'
|
||||
f'- fail_count: {self.fail_count}\n'
|
||||
f'- entries: {self.entries}'
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Site(ModelBase):
|
||||
"""
|
||||
Website.
|
||||
"""
|
||||
|
||||
table: ClassVar = 'site'
|
||||
base_durl: InitVar[Durl] = None
|
||||
feeds: InitVar[dict] = None
|
||||
links_ext: InitVar[dict] = None
|
||||
links_int: InitVar[dict] = None
|
||||
startpage_text: InitVar[str] = None
|
||||
|
||||
canonical_url: Optional[str] = None
|
||||
base_url: Optional[str] = None
|
||||
base_urls: list[str] = field(default_factory=list)
|
||||
domains: list[str] = field(default_factory=list)
|
||||
ips: Optional[list[str]] = None
|
||||
crawl_enabled: bool = False
|
||||
crawl_active: bool = False
|
||||
next_full_crawl: Optional[datetime] = None
|
||||
next_feed_crawl: Optional[datetime] = None
|
||||
last_update: Optional[datetime] = None
|
||||
last_pub: Optional[datetime] = None
|
||||
pub_dates: Optional[dict[str, str]] = None
|
||||
langs: list[str] = field(default_factory=list)
|
||||
alt_langs: dict[str, str] = field(default_factory=dict)
|
||||
title: Optional[str] = None
|
||||
description: Optional[str] = None
|
||||
keywords: list[str] = field(default_factory=list)
|
||||
linkbacks: dict[str, str] = field(default_factory=dict)
|
||||
meta_info: dict = field(default_factory=dict)
|
||||
boilerplate_texts: list[str] = field(default_factory=list)
|
||||
|
||||
def __post_init__(
|
||||
self,
|
||||
base_durl: Durl,
|
||||
feeds=None,
|
||||
links_ext=None,
|
||||
links_int=None,
|
||||
startpage_text=None,
|
||||
):
|
||||
self.feeds = feeds
|
||||
self.links_ext = links_ext
|
||||
self.links_int = links_int
|
||||
self.startpage_text = startpage_text
|
||||
self.keywords = self.keywords[:20]
|
||||
if not self.last_update:
|
||||
self.last_update = datetime.utcnow()
|
||||
pub_date: Optional[str]
|
||||
if self.last_pub:
|
||||
pub_date = date.isoformat(self.last_pub.date())
|
||||
self.pub_dates = {date.isoformat(self.last_update): pub_date}
|
||||
else:
|
||||
pub_date = None
|
||||
self.pub_dates = {}
|
||||
if base_durl:
|
||||
self.base_urls = [base_durl.url()[:200]]
|
||||
self.domains = [extract_domain(base_durl.hostname)[:100]]
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
f'Site(id={self.id_}, url={self.base_url},'
|
||||
f' crawl_enabled={self.crawl_enabled})'
|
||||
)
|
||||
|
||||
async def update_base_url(self) -> None:
|
||||
"""
|
||||
Update the base_url, choosing the most relevant URL.
|
||||
|
||||
If canonical_url is not None, use this.
|
||||
Otherwise set self.base_url to the shortest from self.base_urls,
|
||||
but requiring a https-url if there is at least one.
|
||||
"""
|
||||
if self.canonical_url and self.canonical_url not in self.base_urls:
|
||||
if canonical_durl := await Durl(self.canonical_url):
|
||||
self.base_urls.append(self.canonical_url)
|
||||
domain = extract_domain(canonical_durl.hostname)
|
||||
if domain not in self.domains:
|
||||
self.domains.append(domain)
|
||||
if self.canonical_url:
|
||||
self.base_url = self.canonical_url
|
||||
return
|
||||
if not self.base_url:
|
||||
url_candidates = self.base_urls
|
||||
if https_urls := [
|
||||
url for url in self.base_urls if url.startswith('https://')
|
||||
]:
|
||||
url_candidates = https_urls
|
||||
self.base_url = min(url_candidates, key=len)
|
||||
|
||||
async def save( # type: ignore
|
||||
self, conn, merge=True
|
||||
) -> tuple[Optional[int], bool]:
|
||||
"""
|
||||
Store the site, optionally trying to merge it with an existing site.
|
||||
|
||||
Return the id of the saved instance and whether a new instance
|
||||
was created.
|
||||
|
||||
If self.id_ is not 0, replace the data of the existing site with
|
||||
this id. Else if not merge, store as new row, and if merge,
|
||||
try to merge with an existing matching site.
|
||||
"""
|
||||
await self.update_base_url()
|
||||
if not merge:
|
||||
created = not bool(self.id_)
|
||||
await super().save(conn)
|
||||
return self.id_, created
|
||||
if self.id_:
|
||||
sql = "SELECT base_urls, pub_dates FROM site WHERE id=$1"
|
||||
row = await conn.fetchrow(sql, self.id_)
|
||||
self.base_urls = list(
|
||||
set(row['base_urls']).union(set(self.base_urls))
|
||||
)
|
||||
if previous_pub_dates := row['pub_dates']:
|
||||
if not self.pub_dates:
|
||||
self.pub_dates = {}
|
||||
self.pub_dates.update(previous_pub_dates)
|
||||
await super().save(conn)
|
||||
return self.id_, False
|
||||
same_site_id = await search_same_site(self, conn)
|
||||
if same_site_id:
|
||||
same_site = await Site().load(conn, same_site_id)
|
||||
if same_site_id and same_site:
|
||||
same_site.base_urls = set(same_site.base_urls).union(
|
||||
set(self.base_urls)
|
||||
)
|
||||
same_site.domains = set(same_site.domains).union(set(self.domains))
|
||||
if self.canonical_url and not same_site.canonical_url:
|
||||
same_site.canonical_url = self.canonical_url
|
||||
await same_site.save(conn, merge=False) # call ourselves
|
||||
self.id_ = same_site.id_
|
||||
return self.id_, False
|
||||
else:
|
||||
await super().save(conn)
|
||||
return self.id_, True
|
||||
|
||||
|
||||
@dataclass
|
||||
class SitePath(ModelBase):
|
||||
"""
|
||||
Path of a website. May point to a Resource.
|
||||
"""
|
||||
|
||||
table: ClassVar = 'site_path'
|
||||
site: InitVar[str] = None
|
||||
|
||||
site_id: Optional[int] = None
|
||||
path: Optional[str] = None
|
||||
filtered: bool = False
|
||||
last_visit: Optional[datetime] = None
|
||||
ok_count: int = 0
|
||||
canonical: Optional[bool] = None
|
||||
resource_id: Optional[int] = None
|
||||
|
||||
def __str__(self):
|
||||
return (
|
||||
f'SitePath(id={self.id_}, site_id={self.site_id},'
|
||||
f' path={self.path})'
|
||||
)
|
||||
|
||||
async def save(self, conn: Connection):
|
||||
"""
|
||||
Save the instance, extending the parent's method.
|
||||
"""
|
||||
self.path = self.path[:400] if self.path else ''
|
||||
await super().save(conn)
|
||||
|
||||
async def unlink_resource(self, conn, engine, index_base_name):
|
||||
"""
|
||||
Unlink the resource and also delete it, if it has no more links.
|
||||
"""
|
||||
if self.id_:
|
||||
if self.resource_id:
|
||||
sql = "SELECT COUNT(*) FROM site_path WHERE resource_id=$1"
|
||||
ref_count = await conn.fetchval(sql, self.resource_id)
|
||||
if ref_count == 0:
|
||||
sql = (
|
||||
"DELETE FROM resource WHERE id=$1"
|
||||
" RETURNING (true, lang)"
|
||||
)
|
||||
found = await conn.fetchval(sql, self.resource_id)
|
||||
if found:
|
||||
await delete_resource(
|
||||
engine, found[1], self.resource_id
|
||||
)
|
||||
self.resource_id = None
|
||||
|
||||
def url(self, site):
|
||||
"""
|
||||
Return the full URL (combine the site's base_url with our path).
|
||||
"""
|
||||
return site.base_url + self.path
|
||||
|
||||
|
||||
@dataclass
|
||||
class Crawl(ModelBase):
|
||||
"""
|
||||
The crawl process of a website (begin, end, statistics, ...).
|
||||
"""
|
||||
|
||||
table: ClassVar = 'crawl'
|
||||
site_id: Optional[int] = None
|
||||
is_full: bool = False
|
||||
t_begin: datetime = datetime.utcnow()
|
||||
t_end: Optional[datetime] = None
|
||||
n_resources: int = 0
|
||||
n_resources_new: int = 0
|
||||
|
||||
async def finish(self, conn, set_t_end):
|
||||
"""
|
||||
Save the crawl. Set t_end only if indicated.
|
||||
"""
|
||||
if set_t_end:
|
||||
self.t_end = datetime.utcnow()
|
||||
await self.save(conn)
|
||||
|
||||
|
||||
async def search_same_site(
|
||||
site: Site,
|
||||
conn: Connection,
|
||||
) -> Optional[int]:
|
||||
"""
|
||||
Try to find a matching site for the given *site* and return its id.
|
||||
|
||||
TODO: if the path is non-trivial, require it also for the matching site
|
||||
|
||||
Two sites match when they return the same content for identical paths.
|
||||
The base_url (scheme and/or netloc) may differ.
|
||||
We do not have the content for all paths of both websites, so we need
|
||||
to estimate: We only take into account meta information from the
|
||||
start pages of both sites, in particular the title, description
|
||||
and information obtained the base_urls:
|
||||
|
||||
We use a combination of these conditions:
|
||||
|
||||
1. one of the sites has a canonical URL which matches the
|
||||
URL of the other site
|
||||
2. the content fields (title, description) have sufficient information
|
||||
3. the content fields match exactly
|
||||
4. the domain matches
|
||||
5. the domain matches, except for the TLD
|
||||
6. the base_urls differ in their schemes (http vs. https)
|
||||
7. the hostnames in the base_urls are identical
|
||||
8. the hostnames in the base_urls differ by a prepended 'www.'
|
||||
9. the IPs have at least one common address
|
||||
|
||||
The algorithm is this (first answer is final, yes means match):
|
||||
|
||||
* if (1) : yes
|
||||
* if (2), (3), (4) : yes
|
||||
* if (2), (3), (5), (9) : yes
|
||||
* if (6), ((7) or (8)) : yes
|
||||
* no
|
||||
"""
|
||||
# rule (1)
|
||||
if site.canonical_url:
|
||||
sql = "SELECT id FROM site WHERE $1=ANY(base_urls) LIMIT 1"
|
||||
id_ = await conn.fetchval(sql, site.canonical_url)
|
||||
if id_:
|
||||
return id_
|
||||
else:
|
||||
sql = "SELECT id FROM site WHERE canonical_url=ANY($1) LIMIT 1"
|
||||
id_ = await conn.fetchval(sql, site.base_urls)
|
||||
if id_:
|
||||
return id_
|
||||
|
||||
# rule (6), ((7) or (8))
|
||||
url_variants = set(
|
||||
chain.from_iterable(
|
||||
get_url_variants(base_url) for base_url in site.base_urls
|
||||
)
|
||||
)
|
||||
sql = f"SELECT id FROM site WHERE base_urls && $1 LIMIT 1"
|
||||
if id_ := await conn.fetchval(sql, url_variants):
|
||||
return id_
|
||||
|
||||
# condition (2)
|
||||
if len(site.title or '') > 15 or len(site.description or '') > 15:
|
||||
sql = (
|
||||
f"SELECT * FROM site WHERE"
|
||||
f" COALESCE(title, '')=$1 AND COALESCE(description, '')=$2"
|
||||
)
|
||||
rows = await conn.fetch(sql, site.title or '', site.description or '')
|
||||
# condition (3)
|
||||
if rows:
|
||||
# condition (4)
|
||||
for row in rows:
|
||||
domains = set(row.get('domains', []))
|
||||
if domains & set(site.domains):
|
||||
return row['id']
|
||||
# condition (9)
|
||||
for row in rows:
|
||||
ips = set(row.get('ips', []))
|
||||
if site.ips and ips & set(site.ips):
|
||||
# condition (5)
|
||||
domains_ = row.get('domains', [])
|
||||
d1 = set([tldextract.extract(d).domain for d in domains_])
|
||||
domains_ = site.domains or []
|
||||
d2 = set([tldextract.extract(d).domain for d in domains_])
|
||||
if d1 & d2:
|
||||
return row['id']
|
||||
|
||||
return None
|
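
The dataclass models above share ModelBase's persistence logic (INSERT returning the new id, or UPDATE when id_ is already set). A small sketch, assuming an asyncpg connection `conn` and made-up values:

    async def example(conn):
        feed = Feed(site_id=1, url='https://example.org/feed.xml')
        await feed.save(conn)   # merges with an existing (site_id, url) row
        same = await Feed().load(conn, feed.id_)
        await feed.delete(conn)
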
0
src/atextcrawler/plugin_defaults/__init__.py
Normal file
22
src/atextcrawler/plugin_defaults/filter_resource_path.py
Normal file
@@ -0,0 +1,22 @@
"""
|
||||
Filter paths found in a resource.
|
||||
|
||||
This plugin implements :func:`rp_filter`.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def rp_filter(site, durl) -> Optional[str]:
|
||||
"""
|
||||
Adjust or filter found paths (may depend on site).
|
||||
|
||||
To filter out a path (i.e., not add it to table `site_path`)
|
||||
return None.
|
||||
"""
|
||||
path = durl.pwa()
|
||||
# skip fetching images (linked from a tags; img tags are skipped anyway)
|
||||
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
|
||||
return None
|
||||
path = path.removesuffix('?amp=1')
|
||||
return path
|
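
Deployments can override this default in their own plugins directory. A purely hypothetical variant (the '/tag/' rule is just an illustration, not part of the defaults) might look like:

    def rp_filter(site, durl) -> Optional[str]:
        path = durl.pwa()
        if path.lower().endswith(('.jpg', '.png')) or '/tag/' in path:
            return None
        return path.removesuffix('?amp=1')
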
47
src/atextcrawler/plugin_defaults/filter_site.py
Normal file
@@ -0,0 +1,47 @@
"""
|
||||
Relevance estimation of sites.
|
||||
|
||||
This plugin implements :func:`site_filter`.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from atextcrawler.models import Site
|
||||
|
||||
MIN_RELEVANCE_SCORE = 5
|
||||
|
||||
|
||||
async def site_filter(site: Site) -> bool:
|
||||
"""
|
||||
Assess relevance of the site (using language-dependent criteria).
|
||||
|
||||
If the site shall be crawled, return True, else False.
|
||||
"""
|
||||
# limit to sites in English or German language
|
||||
if not set(['de', 'en']) & set(site.langs):
|
||||
return False
|
||||
score = 0.0
|
||||
for crit_name, weight, langs, crit_re in re_criteria:
|
||||
if '*' in langs or set(langs) & set(site.langs):
|
||||
findings = crit_re.findall(site.startpage_text)
|
||||
if findings:
|
||||
score += weight * len(findings)
|
||||
if site.title and crit_re.search(site.title):
|
||||
score += 4 * weight
|
||||
if site.description and crit_re.search(site.description):
|
||||
score += 4 * weight
|
||||
|
||||
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
|
||||
|
||||
return score >= MIN_RELEVANCE_SCORE
|
||||
|
||||
|
||||
re_criteria = {
|
||||
(
|
||||
'anarch',
|
||||
1.0,
|
||||
('*',),
|
||||
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
|
||||
),
|
||||
('libertär', 0.5, ('de'), re.compile('(libert(är|är))', re.I)),
|
||||
}
|
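
Each entry of re_criteria is a (name, weight, languages, regex) tuple. A hypothetical additional criterion (example only) would follow the same shape:

    ('mutual aid', 0.5, ('en',), re.compile('mutual aid', re.I)),
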
24
src/atextcrawler/plugin_defaults/filter_site_path.py
Normal file
@@ -0,0 +1,24 @@
"""
|
||||
Plugin for filtering paths of a site to be retrieved.
|
||||
|
||||
This plugin implements :func:`sp_filter`.
|
||||
"""
|
||||
|
||||
|
||||
def sp_filter(site, path, robots) -> bool:
|
||||
"""
|
||||
Per-site path filter. Return whether the path shall be retrieved.
|
||||
"""
|
||||
if not robots.can_fetch_url(site.base_url + path):
|
||||
return False
|
||||
if 'amusewiki' in site.meta_info.get('generator', '').lower():
|
||||
if any(
|
||||
[
|
||||
path.endswith(end)
|
||||
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
|
||||
]
|
||||
):
|
||||
return False
|
||||
if '/bbselect?' in path:
|
||||
return False
|
||||
return True
|
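For illustration only (not part of this commit): typical sp_filter decisions; `AllowAllRobots` and `FakeSite` are made-up stand-ins, and example.org is a placeholder domain.

# Sketch: a site whose meta info says it is generated by amusewiki.
from atextcrawler.plugin_defaults.filter_site_path import sp_filter


class AllowAllRobots:
    def can_fetch_url(self, url: str) -> bool:
        return True


class FakeSite:
    base_url = 'https://example.org'
    meta_info = {'generator': 'amusewiki 2.x'}


site, robots = FakeSite(), AllowAllRobots()
print(sp_filter(site, '/library/some-text', robots))       # True
print(sp_filter(site, '/library/some-text.epub', robots))  # False: download format
print(sp_filter(site, '/forum/bbselect?page=2', robots))   # False: bulletin board query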
10
src/atextcrawler/resource/__init__.py
Normal file
@@ -0,0 +1,10 @@
from .dedup import store_boilerplate_texts
from .feed import feed_types, update_feed
from .fetch import ResourceFetcher
from .operations import (
    add_site_paths,
    get_site_path,
    process_site_path,
    store_feed_entries,
)
from .sitemap import extract_sitemap_paths, get_sitemap_urls
96
src/atextcrawler/resource/__main__.py
Normal file
@@ -0,0 +1,96 @@
"""
|
||||
Dev tool for fetching and displaying a resource.
|
||||
|
||||
Has no permanent effects.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from pprint import pformat
|
||||
|
||||
import aiohttp
|
||||
|
||||
from ..models import Feed, TextResource
|
||||
from ..resource import ResourceFetcher
|
||||
from ..utils.annotation import pack_annotations, unpack_annotations
|
||||
from ..utils.durl import Durl
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.addHandler(logging.StreamHandler())
|
||||
logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug')
|
||||
logger_page_debug.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
def add_tags(text, annotations):
|
||||
"""
|
||||
Reconstruct html from text and annotations.
|
||||
|
||||
This is very similar to what the client does when displaying
|
||||
a cached hit.
|
||||
"""
|
||||
html = ''
|
||||
opening_tags = defaultdict(list)
|
||||
closing_tags = defaultdict(list)
|
||||
anns_tags = sorted(
|
||||
annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1])
|
||||
)
|
||||
for (i, f), anns in anns_tags:
|
||||
opening_tags[i] += [tag for tag in reversed(anns)]
|
||||
closing_tags[f] += [tag for tag in reversed(anns)]
|
||||
positions = sorted(set(opening_tags.keys()) | set(closing_tags.keys()))
|
||||
last_pos = 0
|
||||
links = {i: href for href, (i, f, rel) in annotations['links'].items()}
|
||||
for pos in positions:
|
||||
html += text[last_pos:pos]
|
||||
closing = closing_tags.get(pos, [])
|
||||
opening = opening_tags.get(pos, [])
|
||||
common = set(closing) & set(opening)
|
||||
closing = [tag for tag in closing if tag not in common]
|
||||
opening = [tag for tag in opening if tag not in common]
|
||||
tags_html = ''
|
||||
for tag in reversed(closing):
|
||||
html += f'</{tag}>\n'
|
||||
for tag in opening:
|
||||
if tag == 'a':
|
||||
href = links.get(pos, '#')
|
||||
html += f'<a href="{href}">'
|
||||
else:
|
||||
html += f'<{tag}>'
|
||||
last_pos = pos
|
||||
return html
|
||||
|
||||
|
||||
async def run():
|
||||
"""
|
||||
Fetch and display a resource with URL given as cmdline argument.
|
||||
"""
|
||||
url = sys.argv[1]
|
||||
async with aiohttp.ClientSession() as session:
|
||||
if not (durl := await Durl(url)):
|
||||
return
|
||||
fetcher = ResourceFetcher(session)
|
||||
resource = await fetcher.fetch(url)
|
||||
if isinstance(resource, TextResource):
|
||||
logger.warning(repr(resource))
|
||||
logger.warning(f'Language: {resource.lang}')
|
||||
logger.warning(pformat(resource.search_fields))
|
||||
logger.warning(pformat(resource.init_fields))
|
||||
|
||||
# annotations = resource.search_fields.get('annotations')
|
||||
# text = resource.search_fields['text']
|
||||
# with open('/tmp/1.html', 'w') as f:
|
||||
# html = add_tags(text, annotations)
|
||||
# f.write(f'<html lang="de">\n<head><title>hhh</title></head>'
|
||||
# f'<body>\n{html}\n</body></html>')
|
||||
elif isinstance(resource, Feed):
|
||||
logger.warning(resource.debug())
|
||||
else:
|
||||
logger.warning(f'Resource has type {type(resource)}')
|
||||
logger.warning(resource)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(run())
|
59
src/atextcrawler/resource/dedup.py
Normal file
@@ -0,0 +1,59 @@
"""
|
||||
Find boilerplate texts.
|
||||
"""
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from ..models import TextResource
|
||||
from ..utils.probe import extract_samples
|
||||
from ..utils.section import iter_sections
|
||||
|
||||
|
||||
async def store_boilerplate_texts(fetcher, conn, site):
|
||||
"""
|
||||
Find and store boilerplate texts of a site.
|
||||
|
||||
Fetch the start page and internal sample links obtained from it.
|
||||
    If there are sufficiently frequently appearing text sections,
    consider them as boilerplate texts.
|
||||
|
||||
If boilerplate_texts were found, update the given site instance.
|
||||
"""
|
||||
startpage = await fetcher.fetch(site.base_url, site=site)
|
||||
if (
|
||||
not isinstance(startpage, TextResource)
|
||||
or startpage.content_type != 'html'
|
||||
):
|
||||
return
|
||||
|
||||
# fetch sample resources
|
||||
sample_links = extract_samples(startpage.init_fields['links_int'])
|
||||
resources = [startpage]
|
||||
for sample_link in sample_links:
|
||||
if sample_link.path == site.base_url: # avoid duplicate resources
|
||||
continue # NB: duplicate resources may have different paths
|
||||
sample_resource = await fetcher.fetch(sample_link.url(), site=None)
|
||||
if (
|
||||
isinstance(sample_resource, TextResource)
|
||||
and sample_resource.content_type == 'html'
|
||||
):
|
||||
resources.append(sample_resource)
|
||||
|
||||
# find common texts in resources
|
||||
if (n_resources := len(resources)) > 2:
|
||||
text_freq = Counter()
|
||||
for resource in resources:
|
||||
text = resource.search_fields['text']
|
||||
semantic_breaks = resource.search_fields['annotations'][
|
||||
'semantic_breaks'
|
||||
]
|
||||
for sec in iter_sections(text, semantic_breaks):
|
||||
text_freq[sec[3]] += 1
|
||||
boilerplate_texts = []
|
||||
if min(text_freq.values() or [0]) == 1: # no resource fetched twice
|
||||
for text, freq in text_freq.items():
|
||||
if freq > 2:
|
||||
boilerplate_texts.append(text)
|
||||
sql = "UPDATE site SET boilerplate_texts=$1 WHERE id=$2"
|
||||
await conn.execute(sql, boilerplate_texts, site.id_)
|
||||
site.boilerplate_texts = boilerplate_texts
|
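For illustration only (not part of this commit): the frequency criterion above, reduced to plain lists of section texts; the page contents are fabricated, and the real code derives the sections from the (text, semantic_breaks) annotations.

# Sketch: sections occurring in more than two sampled pages count as boilerplate.
from collections import Counter

pages = [
    ['Imprint', 'Subscribe to our newsletter', 'Article one body'],
    ['Imprint', 'Subscribe to our newsletter', 'Article two body'],
    ['Imprint', 'Subscribe to our newsletter', 'Article three body'],
]
text_freq = Counter(section for page in pages for section in page)
boilerplate_texts = [text for text, freq in text_freq.items() if freq > 2]
print(boilerplate_texts)  # ['Imprint', 'Subscribe to our newsletter']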
131
src/atextcrawler/resource/document.py
Normal file
@@ -0,0 +1,131 @@
"""
|
||||
Parse documents (often application/pdf).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Optional, Union
|
||||
|
||||
from tika import parser
|
||||
|
||||
from ..models import ResourceError, ResourceRedirect, Site, TextResource
|
||||
from ..utils.durl import Durl
|
||||
from ..utils.http import get_header_links
|
||||
from ..utils.lang import extract_content_language
|
||||
from .plaintext import annotate_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger_debug = logging.getLogger(__name__ + '.debug')
|
||||
logger_debug.setLevel(logging.INFO)
|
||||
|
||||
|
||||
re_url = re.compile(
|
||||
r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
|
||||
r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
|
||||
)
|
||||
|
||||
|
||||
async def parse_document(
|
||||
durl: Durl,
|
||||
resp: dict,
|
||||
site: Optional[Site],
|
||||
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
|
||||
"""
|
||||
Extract plain text from documents in various formats.
|
||||
"""
|
||||
content = resp['content']
|
||||
|
||||
# HTTP headers, canonical URL, shortlink
|
||||
header_links = await get_header_links(resp['headers'], durl, site)
|
||||
if canonical := header_links.get('canonical'):
|
||||
if canonical != durl.url():
|
||||
return ResourceRedirect(resp['redirects'] + [canonical])
|
||||
shortlink = header_links.get('shortlink')
|
||||
|
||||
# use tika to extract text
|
||||
doc = parser.from_buffer(content)
|
||||
# logger.debug(pformat(doc))
|
||||
if doc.get('status') != 200:
|
||||
msg = f'Analyzing document failed: {durl.url()}'
|
||||
return ResourceError(msg)
|
||||
|
||||
# collect meta data
|
||||
meta = doc.get('metadata', {})
|
||||
content_type = meta.get('Content-Type')
|
||||
if isinstance(content_type, list):
|
||||
content_type = content_type[-1]
|
||||
title = concat(meta.get('title'))
|
||||
concat(meta.get('creator'))
|
||||
last_change = extract_latest(meta.get('date') or meta.get('created'))
|
||||
keywords = None
|
||||
|
||||
# text content
|
||||
text = (doc.get('content') or '').strip()
|
||||
|
||||
# links
|
||||
links_int: dict[Durl, tuple[list[str], str]] = {}
|
||||
links_ext: dict[Durl, tuple[list[str], str]] = {}
|
||||
for url in re_url.findall(text):
|
||||
link_durl = await Durl(url[0])
|
||||
if link_durl:
|
||||
if link_durl.site() == durl.site():
|
||||
links_int[link_durl] = [], link_durl.url()
|
||||
else:
|
||||
links_ext[link_durl] = [], link_durl.url()
|
||||
|
||||
# annotations
|
||||
text, annotations = annotate_text(text)
|
||||
|
||||
return TextResource(
|
||||
content_type=content_type,
|
||||
last_change=last_change,
|
||||
text_len=len(text),
|
||||
lang=extract_content_language(text),
|
||||
title=title,
|
||||
init_fields={
|
||||
'durl': durl,
|
||||
'site': site,
|
||||
'headers': resp['headers'],
|
||||
'redirects': resp['redirects'],
|
||||
'links_int': links_int,
|
||||
'links_ext': links_ext,
|
||||
'shortlink': shortlink,
|
||||
'canonical': None,
|
||||
},
|
||||
search_fields={
|
||||
'title': title,
|
||||
'pub_date': last_change,
|
||||
'keywords': keywords,
|
||||
'text': text,
|
||||
'annotations': annotations,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]:
|
||||
"""
|
||||
Extract the lastest date (if any) from a string or list of strings.
|
||||
"""
|
||||
if not s:
|
||||
return None
|
||||
if not isinstance(s, list):
|
||||
s = [s]
|
||||
dt = []
|
||||
for t in s:
|
||||
try:
|
||||
dt.append(datetime.fromisoformat(t.rstrip('Z')))
|
||||
except:
|
||||
pass
|
||||
return max(dt) if dt else None
|
||||
|
||||
|
||||
def concat(s: Optional[Union[str, list]]) -> Optional[str]:
|
||||
"""
|
||||
Helper function for joining strings together.
|
||||
"""
|
||||
if not s:
|
||||
return None
|
||||
if not isinstance(s, list):
|
||||
s = [s]
|
||||
return ' '.join(s)
|
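For illustration only (not part of this commit): how the two metadata helpers above handle values as Tika typically returns them (either a string or a list of strings), assuming the package is importable.

# Sketch: behaviour of concat and extract_latest on sample metadata values.
from atextcrawler.resource.document import concat, extract_latest

print(concat(['Jane Doe', 'John Doe']))  # 'Jane Doe John Doe'
print(concat(None))                      # None
print(extract_latest(['2021-03-01T10:00:00Z', '2021-11-30T08:15:00Z']))
# 2021-11-30 08:15:00  (the latest parsable date wins)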
155
src/atextcrawler/resource/feed.py
Normal file
@@ -0,0 +1,155 @@
"""
|
||||
Stuff related to feeds.
|
||||
|
||||
Higher-level stuff is in site.feeds.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional, Union
|
||||
|
||||
from asyncpg import Connection
|
||||
from feedparser import parse
|
||||
|
||||
from ..models import Feed, MetaResource, ResourceError
|
||||
from ..utils.durl import Durl
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
feed_types = (
|
||||
'application/rss+xml',
|
||||
'application/atom+xml',
|
||||
'application/feed+json',
|
||||
)
|
||||
|
||||
|
||||
async def update_feed(fetcher, feed, conn) -> Optional[list[dict]]:
|
||||
"""
|
||||
Fetch, parse and return a given feed's content. Also update *feed*.
|
||||
|
||||
If the server replied with HTTP 410, delete the feed.
|
||||
If there is no new information (server replied with HTTP 304),
|
||||
return None. For other errors also return None and increase the
|
||||
fail_count.
|
||||
"""
|
||||
headers = {'Cache-control': 'max-age=600'}
|
||||
if feed.modified:
|
||||
headers['If-Modified-Since'] = feed.modified
|
||||
elif feed.etag:
|
||||
headers['If-None-Match'] = feed.etag.removeprefix('W/')
|
||||
resource = await fetcher.fetch(feed.url, headers=headers)
|
||||
if isinstance(resource, ResourceError):
|
||||
if resource.status == 410:
|
||||
msg = f'Feed has vanished, deleting it: {feed}'
|
||||
logger.debug(msg)
|
||||
await feed.delete(conn)
|
||||
if resource.status != 304:
|
||||
feed.fail_count += 1
|
||||
if feed.fail_count > 5:
|
||||
msg = f'Feed not reachable, deleting it: {feed}'
|
||||
logger.debug(msg)
|
||||
await feed.delete(conn)
|
||||
        return None  # HTTP 304 (no new entries) or another error
|
||||
elif isinstance(resource, Feed):
|
||||
resource.id_ = feed.id_
|
||||
resource.site_id = feed.site_id
|
||||
await resource.save(conn)
|
||||
return resource.entries
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
def parse_json_feed(resp, data: dict) -> Feed:
|
||||
"""
|
||||
Parse a JSON response for jsonfeed information.
|
||||
|
||||
TODO: handle 'next_url' (see https://jsonfeed.org/version/1.1)
|
||||
"""
|
||||
feed = Feed()
|
||||
feed.url = data.get('feed_url', resp['redirects'][-1])
|
||||
feed.etag = resp['headers'].get('ETag')
|
||||
feed.modified = resp['headers'].get('Last-Modified')
|
||||
feed.t_visit = datetime.utcnow()
|
||||
version = data.get('version', '')
|
||||
version = 'json-' + version.removeprefix('https://jsonfeed.org/version/')
|
||||
feed.version = version[:10]
|
||||
feed.title = data.get('title')
|
||||
feed.description = data.get('description')
|
||||
feed.fail_count = 0
|
||||
entries = []
|
||||
latest = None
|
||||
# parse feed entries to a dict compatible with feedparser's entries
|
||||
for feed_item in data.get('items', []):
|
||||
entry = {}
|
||||
entry['link'] = feed_item.get('url')
|
||||
dt = feed_item.get('date_published')
|
||||
if dt:
|
||||
dt = datetime.fromisoformat(dt) if dt else None
|
||||
dt = dt.astimezone(tz=None).replace(tzinfo=timezone.utc)
|
||||
entry['published_parsed'] = dt.timetuple()
|
||||
entry['title'] = feed_item.get('title')
|
||||
entry['summary'] = feed_item.get('summary')
|
||||
entries.append(entry)
|
||||
if dt:
|
||||
latest = max(latest or dt, dt)
|
||||
feed.entries = entries
|
||||
feed.t_content = latest
|
||||
return feed
|
||||
|
||||
|
||||
def parse_xml_feed(resp) -> Union[Feed, ResourceError]:
|
||||
"""
|
||||
Parse a response from Fetcher.get_resp() for xml feed information.
|
||||
"""
|
||||
feed = Feed()
|
||||
feed.url = resp['redirects'][-1]
|
||||
feed.etag = resp['headers'].get('ETag')
|
||||
feed.modified = resp['headers'].get('Last-Modified')
|
||||
feed.t_visit = datetime.utcnow()
|
||||
try:
|
||||
parsed = parse(resp['content'], response_headers=resp['headers'])
|
||||
except Exception as error:
|
||||
return ResourceError(f'Feedparser error: {error}')
|
||||
latest = parsed['feed'].get('updated_parsed')
|
||||
if latest:
|
||||
latest = datetime(*latest[:6])
|
||||
feed.t_content = max(feed.t_content or latest, latest)
|
||||
feed.version = parsed['version']
|
||||
feed.title = parsed['feed'].get('title', '')[:200] or None
|
||||
feed.description = parsed['feed'].get('description')
|
||||
feed.fail_count = 0
|
||||
feed.entries = parsed['entries']
|
||||
return feed
|
||||
|
||||
|
||||
def convert_feed_entries(
|
||||
base_url: Optional[str],
|
||||
entries: list[dict],
|
||||
) -> tuple[
|
||||
list[tuple[str, bool]],
|
||||
dict[str, tuple[Optional[str], Optional[str], Optional[str]]],
|
||||
]:
|
||||
"""
|
||||
Extract paths and resource meta information from a feed's entries.
|
||||
|
||||
Return paths in a structure wanted by :func:`add_site_paths` and
|
||||
resource meta information in a structure wanted by
|
||||
:func:`update_resource_meta`.
|
||||
"""
|
||||
paths = []
|
||||
resource_meta = {}
|
||||
for entry in entries:
|
||||
if entry.get('link') and entry['link'].startswith(base_url or ''):
|
||||
path = entry['link'].removeprefix(base_url or '').lstrip('/')
|
||||
if len(path) <= 200:
|
||||
last_update = entry.get('published_parsed')
|
||||
if last_update:
|
||||
last_update = datetime(*last_update[:6])
|
||||
paths.append((path, True))
|
||||
resource_meta[path] = (
|
||||
last_update,
|
||||
entry.get('title', '')[:200] or None,
|
||||
entry.get('summary', '')[:2000] or None,
|
||||
)
|
||||
return paths, resource_meta
|
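For illustration only (not part of this commit): the structures returned by convert_feed_entries for one matching entry; the entry dicts mimic feedparser's output and the URLs are placeholders.

# Sketch: one entry below base_url is converted, the other is ignored.
from datetime import datetime

from atextcrawler.resource.feed import convert_feed_entries

entries = [
    {
        'link': 'https://example.org/blog/hello-world',
        'published_parsed': datetime(2021, 11, 1, 12, 0).timetuple(),
        'title': 'Hello world',
        'summary': 'First post.',
    },
    {'link': 'https://elsewhere.example.com/x'},  # not below base_url: ignored
]
paths, resource_meta = convert_feed_entries('https://example.org', entries)
print(paths)          # [('blog/hello-world', True)]
print(resource_meta)  # {'blog/hello-world': (datetime(2021, 11, 1, 12, 0), 'Hello world', 'First post.')}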
327
src/atextcrawler/resource/fetch.py
Normal file
@@ -0,0 +1,327 @@
"""
|
||||
Access to a resource specified by a URL.
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import logging
|
||||
from json import loads
|
||||
from traceback import format_exc
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import aiohttp
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from ..models import (
|
||||
Feed,
|
||||
MetaResource,
|
||||
ResourceError,
|
||||
ResourceRedirect,
|
||||
Site,
|
||||
TextResource,
|
||||
)
|
||||
from ..utils.durl import Durl
|
||||
from ..utils.link import in_blacklist
|
||||
from .document import parse_document
|
||||
from .feed import parse_json_feed, parse_xml_feed
|
||||
from .page import parse_html
|
||||
from .plaintext import parse_plaintext
|
||||
from .sitemap import parse_sitemap, parse_sitemapindex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MAX_REDIRECTS = 10
|
||||
"""
|
||||
Maximum number of redirects to follow.
|
||||
"""
|
||||
|
||||
|
||||
default_headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)'
|
||||
' Gecko/20100101 Firefox/78.0',
|
||||
'DNT': '1',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Accept-Language': 'en-US,en;q=0.5, *;q=0.5',
|
||||
}
|
||||
"""
|
||||
Default HTTP client headers, overwriting those of aiohttp.ClientSession.
|
||||
"""
|
||||
|
||||
|
||||
blacklist_content_types = [
|
||||
'',
|
||||
'application/ogg',
|
||||
]
|
||||
"""
|
||||
Blacklist for content-types.
|
||||
"""
|
||||
|
||||
|
||||
text_content_types = {
|
||||
'text/html': 'html',
|
||||
'text/plain': 'plain',
|
||||
'application/rss+xml': 'feed-rss',
|
||||
'application/atom+xml': 'feed-atom',
|
||||
'application/feed+json': 'feed-json',
|
||||
'application/json': 'json',
|
||||
'application/xml': 'xml',
|
||||
'text/xml': 'xml',
|
||||
}
|
||||
"""
|
||||
Map content-types to parsers.
|
||||
"""
|
||||
|
||||
|
||||
class ResourceFetcher:
|
||||
"""
|
||||
Fetch a resource specified by a URL (:meth:`fetch`).
|
||||
|
||||
The timeout is the same for all requests.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
session: aiohttp.ClientSession,
|
||||
timeout_sock_connect: Union[int, float] = 8,
|
||||
timeout_sock_read: Union[int, float] = 30,
|
||||
):
|
||||
self.session = session
|
||||
self.timeout = aiohttp.ClientTimeout(
|
||||
sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
|
||||
)
|
||||
|
||||
async def fetch(
|
||||
self,
|
||||
url: str,
|
||||
site: Optional[Site] = None,
|
||||
redirect_history: Optional[list[str]] = None,
|
||||
headers: Optional[dict] = None,
|
||||
) -> Union[
|
||||
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||
]:
|
||||
"""
|
||||
Try to fetch a resource and return an instance or error or redirect.
|
||||
|
||||
If an error was encountered, return a ResourceError.
|
||||
If the resource has an irrelevant content type, return None.
|
||||
Otherwise return a specific content instance.
|
||||
|
||||
Argument *redirect_history* contains the redirect history;
|
||||
if one of the redirects is encountered again, return None.
|
||||
"""
|
||||
if redirect_history is None:
|
||||
redirect_history = []
|
||||
if not (durl := await Durl(url)):
|
||||
return ResourceError('Invalid URL')
|
||||
resp = await self.get_resp(
|
||||
durl,
|
||||
redirect_history=redirect_history,
|
||||
headers=headers,
|
||||
)
|
||||
if isinstance(resp, ResourceError):
|
||||
return resp
|
||||
if resp is None:
|
||||
return None
|
||||
result = await self._parse(durl, site, resp)
|
||||
if isinstance(result, (MetaResource, TextResource)):
|
||||
result.id_ = None
|
||||
return result
|
||||
|
||||
async def _parse(
|
||||
self, durl, site, resp, in_recursion=False
|
||||
) -> Union[
|
||||
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||
]:
|
||||
"""
|
||||
Parse a response. May call itself.
|
||||
"""
|
||||
result: Union[
|
||||
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||
] = None
|
||||
content = resp['content']
|
||||
if isinstance(content, str) and content.startswith('<?xml '):
|
||||
result = await parse_xml(durl, resp)
|
||||
elif resp['parser'] == 'feed-rss':
|
||||
result = await parse_xml(durl, resp, rss=True)
|
||||
elif resp['parser'] == 'feed-atom':
|
||||
result = await parse_xml(durl, resp, atom=True)
|
||||
elif resp['parser'] == 'xml':
|
||||
result = await parse_xml(durl, resp)
|
||||
elif resp['parser'] == 'html':
|
||||
result = await parse_html(durl, resp, site)
|
||||
elif resp['parser'] in ('json', 'feed-json'):
|
||||
result = await parse_json(durl, resp)
|
||||
elif resp['parser'] == 'plain':
|
||||
result = await parse_plaintext(durl, resp, site)
|
||||
elif resp['parser'] == 'application':
|
||||
if resp['headers'].get('content-type') == 'application/x-gzip':
|
||||
if in_recursion:
|
||||
return None # consider nested gzip an attack
|
||||
resp['content'] = gzip.decompress(resp['content'])
|
||||
return await self._parse(durl, site, resp, in_recursion=True)
|
||||
result = await parse_document(durl, resp, site)
|
||||
if isinstance(result, ResourceRedirect):
|
||||
redir_url = result.urls[-1]
|
||||
result = await self.fetch(
|
||||
redir_url,
|
||||
site=site,
|
||||
redirect_history=result.urls[:-1],
|
||||
)
|
||||
return result
|
||||
|
||||
async def get_resp(
|
||||
self,
|
||||
durl: Durl,
|
||||
headers: dict = None,
|
||||
redirect_history: Optional[list[str]] = None,
|
||||
) -> Optional[Union[ResourceError, dict]]:
|
||||
"""
|
||||
Try to fetch a url returning a ResourceError or a dict with content.
|
||||
|
||||
Optional *headers* will overwrite the :var:`default_headers`.
|
||||
|
||||
        If the response status is not 200, always return a ResourceError.
|
||||
|
||||
If the content-type is not relevant (see blacklist_content_types),
|
||||
return None.
|
||||
|
||||
The dict contains these keys+values:
|
||||
|
||||
        * 'parser': a hint on the parser to use for analyzing the content;
          one of 'html', 'plain', 'json', 'xml', 'application',
          'feed-rss', 'feed-atom', 'feed-json'
|
||||
* 'content': bytes for type application, otherwise str
|
||||
* 'redirects': a list of URLs visited during HTTP redirection,
|
||||
the last item is the final URL
|
||||
* 'headers': response headers
|
||||
"""
|
||||
if redirect_history is None:
|
||||
redirect_history = []
|
||||
if len(redirect_history) >= MAX_REDIRECTS:
|
||||
return None
|
||||
headers_ = default_headers.copy()
|
||||
if headers:
|
||||
headers_.update(headers)
|
||||
try:
|
||||
async with self.session.get(
|
||||
durl.url(),
|
||||
headers=headers_,
|
||||
timeout=self.timeout,
|
||||
) as resp:
|
||||
redirects = [durl.url()]
|
||||
if resp.history:
|
||||
href = resp.history[-1].headers.get('location')
|
||||
if not href or not (redurl := await Durl(href, base=durl)):
|
||||
msg = 'Invalid URL after HTTP redirect'
|
||||
return ResourceError(msg)
|
||||
if in_blacklist(redurl.hostname):
|
||||
src_url = (
|
||||
redirect_history[0]
|
||||
if redirect_history
|
||||
else durl.url()
|
||||
)
|
||||
msg = (
|
||||
f'Dropping URL {src_url}, since'
|
||||
f' redirected to a blacklisted site'
|
||||
)
|
||||
logger.debug(msg)
|
||||
return None
|
||||
redirects = [str(r.url) for r in resp.history]
|
||||
redirects.append(redurl.url())
|
||||
if join := set(redirect_history) & set(redirects):
|
||||
msg = f'Cyclic redirect {join}'
|
||||
return ResourceError(msg)
|
||||
if resp.status != 200:
|
||||
msg = f'HTTP status {resp.status}'
|
||||
return ResourceError(
|
||||
msg, status=resp.status, headers=headers
|
||||
)
|
||||
c_type = resp.headers.get('content-type', '').split(';')[0]
|
||||
if c_type in blacklist_content_types:
|
||||
return None
|
||||
result: dict[str, Any] = {
|
||||
'redirects': redirect_history + redirects,
|
||||
'headers': resp.headers,
|
||||
}
|
||||
if c_type in text_content_types.keys():
|
||||
try: # catch decoding issues
|
||||
content = await resp.text()
|
||||
except:
|
||||
body = await resp.read()
|
||||
encoding = resp.charset or 'utf-8'
|
||||
encoding = encoding.replace('CP-1250', 'cp1250')
|
||||
content = body.decode(encoding, errors='replace')
|
||||
result['content'] = content
|
||||
result['parser'] = text_content_types[c_type]
|
||||
return result
|
||||
elif c_type.startswith('application/'):
|
||||
result['content'] = await resp.read()
|
||||
result['parser'] = 'application'
|
||||
return result
|
||||
except aiohttp.ClientError as error:
|
||||
# on certificate error try without tls
|
||||
if 'SSLCertVerificationError' in str(error):
|
||||
if durl.scheme == 'https':
|
||||
url = durl.url()
|
||||
durl.replace_scheme('http')
|
||||
response = await self.get_resp(
|
||||
durl=durl,
|
||||
headers=headers,
|
||||
redirect_history=redirect_history + [url],
|
||||
)
|
||||
if not isinstance(response, ResourceError):
|
||||
return response
|
||||
msg = f'ClientError: {error}'
|
||||
return ResourceError(msg)
|
||||
except Exception as error:
|
||||
msg = f'Unknown error: {error}:\n{format_exc()}'
|
||||
logger.error(msg)
|
||||
return ResourceError(msg)
|
||||
return None
|
||||
|
||||
|
||||
async def parse_xml(
|
||||
durl: Durl,
|
||||
response: dict,
|
||||
rss=False,
|
||||
atom=False,
|
||||
) -> Optional[Union[MetaResource, ResourceError]]:
|
||||
"""
|
||||
Parse XML content.
|
||||
|
||||
In particular, parse sitemapindex, sitemap, RSS feed, atom feed.
|
||||
"""
|
||||
try:
|
||||
xml = response['content']
|
||||
soup = BeautifulSoup(xml, 'html.parser')
|
||||
except:
|
||||
return None
|
||||
if rss or (rss := soup.find('rss')):
|
||||
return parse_xml_feed(response)
|
||||
elif atom or (atom := soup.find('atom')):
|
||||
return parse_xml_feed(response)
|
||||
elif sitemapindex := soup.find('sitemapindex'):
|
||||
return parse_sitemapindex(sitemapindex)
|
||||
elif urlset := soup.find('urlset'):
|
||||
return parse_sitemap(urlset)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
async def parse_json(
|
||||
durl: Durl,
|
||||
response: dict,
|
||||
) -> Optional[Union[Feed, ResourceError]]:
|
||||
"""
|
||||
Parse the content of JSON feeds.
|
||||
"""
|
||||
try:
|
||||
data = loads(response['content'])
|
||||
except:
|
||||
msg = f'Could not parse JSON from {durl.url()}'
|
||||
logger.debug(msg)
|
||||
return None
|
||||
if not isinstance(data, dict):
|
||||
return None
|
||||
if data.get('version', '').startswith('https://jsonfeed.org/'):
|
||||
return parse_json_feed(response, data)
|
||||
return None
|
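For illustration only (not part of this commit): a minimal standalone use of ResourceFetcher, mirroring what resource/__main__.py above does; the URL is a placeholder and the timeout value is arbitrary.

# Sketch: fetch one URL and inspect the kind of result returned.
import asyncio

import aiohttp

from atextcrawler.resource import ResourceFetcher


async def main(url: str):
    async with aiohttp.ClientSession() as session:
        fetcher = ResourceFetcher(session, timeout_sock_read=10)
        result = await fetcher.fetch(url)
        # result is None, a ResourceError, a MetaResource (feed, sitemap)
        # or a TextResource, as documented above
        print(type(result).__name__, result)


if __name__ == '__main__':
    asyncio.run(main('https://example.org/'))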
347
src/atextcrawler/resource/operations.py
Normal file
@@ -0,0 +1,347 @@
"""
|
||||
Operations on resources.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Optional, Sequence
|
||||
|
||||
from asyncpg import Connection
|
||||
|
||||
from ..models import (
|
||||
Feed,
|
||||
MetaResource,
|
||||
ResourceError,
|
||||
Site,
|
||||
Sitemap,
|
||||
SitemapIndex,
|
||||
SitePath,
|
||||
TextResource,
|
||||
)
|
||||
from ..search import delete_resource, index_resource
|
||||
from ..tensorflow import TensorFlow
|
||||
from ..utils.durl import Durl
|
||||
from ..utils.similarity import (
|
||||
create_simhash,
|
||||
search_simhash,
|
||||
simhash_from_bigint,
|
||||
simhash_to_bigint,
|
||||
)
|
||||
from .feed import convert_feed_entries
|
||||
from .fetch import ResourceFetcher
|
||||
from .sitemap import extract_sitemap_paths
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def add_site_paths(
|
||||
conn: Connection,
|
||||
site_id: int,
|
||||
paths: Sequence[tuple[str, Optional[bool]]],
|
||||
) -> None:
|
||||
"""
|
||||
    Add site paths. If resource infos are given, also create resources.

    The paths must be given as relative paths, each together with a boolean
    telling whether the link is a canonical link.
|
||||
"""
|
||||
sql = (
|
||||
"INSERT INTO site_path (site_id, path, canonical)"
|
||||
" VALUES ($1, $2, $3) ON CONFLICT (site_id, path) DO NOTHING"
|
||||
)
|
||||
values = (
|
||||
(site_id, path, canonical)
|
||||
for path, canonical in paths[:100000]
|
||||
if len(path) <= 400
|
||||
)
|
||||
await conn.executemany(sql, values)
|
||||
|
||||
|
||||
async def update_resource_meta(
|
||||
conn: Connection,
|
||||
site_id: int,
|
||||
resource_meta: dict,
|
||||
) -> None:
|
||||
"""
|
||||
Update meta information of existing resources using path to find them.
|
||||
"""
|
||||
sql = (
|
||||
"UPDATE resource SET last_change=coalesce($1, last_change),"
|
||||
" title=coalesce($2, title), summary=coalesce($3, summary) FROM ("
|
||||
" SELECT resource_id FROM site_path WHERE site_id=$4 AND path=$5"
|
||||
") sp WHERE resource.id=sp.resource_id"
|
||||
)
|
||||
values = ((*meta, site_id, path) for path, meta in resource_meta.items())
|
||||
await conn.executemany(sql, values)
|
||||
|
||||
|
||||
async def store_feed_entries(
|
||||
conn: Connection,
|
||||
site: Site,
|
||||
entries: list[dict],
|
||||
) -> None:
|
||||
"""
|
||||
Add missing resources of a site from given feed entries.
|
||||
"""
|
||||
if site.id_:
|
||||
paths, resource_meta = convert_feed_entries(site.base_url, entries)
|
||||
await add_site_paths(conn, site.id_, paths)
|
||||
await update_resource_meta(conn, site.id_, resource_meta)
|
||||
|
||||
|
||||
async def get_site_path(
|
||||
conn: Connection,
|
||||
site: Site,
|
||||
before: datetime,
|
||||
only_new=False,
|
||||
) -> Optional[SitePath]:
|
||||
"""
|
||||
Return the next path of a given site that needs to be processed.
|
||||
|
||||
If none needs to be processed, return None.
|
||||
|
||||
Only return paths that have last been visited before *before*
|
||||
or not been processed at all. Paths with a ok_count of -3 or lower
|
||||
are dropped.
|
||||
|
||||
If *only_new*, limit to paths that have not been processed at all,
|
||||
irrespective of the value of *before*.
|
||||
"""
|
||||
if only_new:
|
||||
sql = (
|
||||
"SELECT * FROM site_path"
|
||||
" WHERE site_id=$1 AND last_visit is null LIMIT 1"
|
||||
) # implicitly canonical=null
|
||||
row = await conn.fetchrow(sql, site.id_)
|
||||
else:
|
||||
sql = (
|
||||
"SELECT * FROM site_path"
|
||||
" WHERE site_id=$1 AND canonical IS NOT false AND"
|
||||
" (last_visit is null OR last_visit<$2) AND"
|
||||
" ok_count > -3 LIMIT 1"
|
||||
) # canonical can be true or null
|
||||
row = await conn.fetchrow(sql, site.id_, before)
|
||||
if row:
|
||||
return await SitePath().load_from_row(row)
|
||||
return None
|
||||
|
||||
|
||||
async def process_site_path(
|
||||
app,
|
||||
worker_number: int,
|
||||
conn: Connection,
|
||||
fetcher: ResourceFetcher,
|
||||
tf: TensorFlow,
|
||||
site: Site,
|
||||
site_path: SitePath,
|
||||
) -> bool:
|
||||
"""
|
||||
Fetch a path, deduplicate and if canonical, update and index the resource.
|
||||
|
||||
    Return whether a new resource was handled that should count toward
    the crawl statistics.
|
||||
"""
|
||||
msg = (
|
||||
f'Worker {worker_number} processing site {site.id_}'
|
||||
f' site_path {site_path.id_} {site.base_url}{site_path.path}'
|
||||
)
|
||||
logger.debug(msg)
|
||||
if not site.id_: # only to satisfy typing
|
||||
return False
|
||||
|
||||
# fetch url
|
||||
site_path.last_visit = datetime.utcnow()
|
||||
url = site_path.url(site)
|
||||
resource = await fetcher.fetch(url, site=site)
|
||||
|
||||
# handle failure (possibly deleting old information)
|
||||
if not isinstance(resource, (TextResource, MetaResource)):
|
||||
if not resource: # irrelevant content-type
|
||||
site_path.ok_count = -10
|
||||
elif isinstance(resource, ResourceError):
|
||||
site_path.ok_count -= 1
|
||||
if site_path.ok_count <= -3 and site_path.resource_id:
|
||||
await site_path.unlink_resource(
|
||||
conn,
|
||||
app.search_engine,
|
||||
app.config['elasticsearch']['index_base_name'],
|
||||
)
|
||||
await site_path.save(conn)
|
||||
if resource: # relevant content-type
|
||||
msg = (
|
||||
f'Worker {worker_number} failed to process site_path'
|
||||
f' {site_path.id_} (site {site.id_},'
|
||||
f' {site.base_url}{site_path.path})'
|
||||
)
|
||||
logger.info(msg)
|
||||
return False
|
||||
|
||||
# handle MetaResources
|
||||
if isinstance(resource, MetaResource):
|
||||
if isinstance(resource, Feed):
|
||||
resource.site_id = site.id_
|
||||
await resource.save(conn)
|
||||
if resource.entries:
|
||||
await store_feed_entries(conn, site, resource.entries)
|
||||
elif isinstance(resource, Sitemap):
|
||||
paths, _ = extract_sitemap_paths(site.base_url, resource.urls)
|
||||
await add_site_paths(conn, site.id_, paths)
|
||||
elif isinstance(resource, SitemapIndex):
|
||||
for sitemap_dict in resource.sitemaps:
|
||||
url = sitemap_dict['loc']
|
||||
res_sitemap = await fetcher.fetch(url, site=site)
|
||||
if isinstance(res_sitemap, Sitemap):
|
||||
paths, _ = extract_sitemap_paths(
|
||||
site.base_url, res_sitemap.urls
|
||||
)
|
||||
await add_site_paths(conn, site.id_, paths)
|
||||
return False
|
||||
|
||||
# handle TextResource
|
||||
relevant, is_new_resource = await _handle_text_resource(
|
||||
app, conn, tf, site, site_path, resource, url
|
||||
)
|
||||
if not relevant:
|
||||
return False
|
||||
site_path.resource_id = resource.id_
|
||||
site_path.canonical = resource.init_fields.get('canonical')
|
||||
site_path.ok_count += 1
|
||||
await site_path.save(conn)
|
||||
|
||||
if shortlink_url := resource.init_fields.get('shortlink'):
|
||||
await _save_shortlink(
|
||||
conn, site, url, resource, shortlink_url, site_path.last_visit
|
||||
)
|
||||
|
||||
return is_new_resource
|
||||
|
||||
|
||||
async def _handle_text_resource(
|
||||
app, conn, tf, site, site_path, resource, url
|
||||
) -> tuple[bool, bool]:
|
||||
"""
|
||||
Ingest a text resource.
|
||||
|
||||
Return whether the resource is relevant and whether it is new.
|
||||
"""
|
||||
# save the resource's internal links
|
||||
paths = []
|
||||
if links_int := resource.init_fields['links_int']:
|
||||
for durl, (rel, _) in links_int.items():
|
||||
rp_filter = app.plugins['filter_resource_path'].rp_filter
|
||||
if path := rp_filter(site, durl):
|
||||
canon = (rel and rel.lower() == 'canonical') or None
|
||||
paths.append((path, canon))
|
||||
await add_site_paths(conn, site.id_, paths)
|
||||
|
||||
# find resources similar to the current text
|
||||
text = resource.search_fields['text']
|
||||
if len(text) < 300: # discard resources with too short texts
|
||||
site_path.resource_id = None
|
||||
await site_path.save(conn)
|
||||
return False, False
|
||||
simhash = simhash_from_bigint(resource.simhash)
|
||||
index = site.simhash_index
|
||||
similar_ids = search_simhash(index, simhash)
|
||||
|
||||
# determine the destination resource and resources to be merged into it
|
||||
old_id = site_path.resource_id
|
||||
if (
|
||||
old_id
|
||||
and old_id in similar_ids
|
||||
and ( # similar to old text
|
||||
dest_resource := await TextResource().load(conn, old_id)
|
||||
)
|
||||
):
|
||||
merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
|
||||
else: # no old text, or old text not similar any more
|
||||
if old_id:
|
||||
await site_path.unlink_resource(
|
||||
conn,
|
||||
app.search_engine,
|
||||
app.config['elasticsearch']['index_base_name'],
|
||||
)
|
||||
# find the first existing similar resource
|
||||
for similar_id in similar_ids:
|
||||
dest_resource = await TextResource().load(conn, similar_id)
|
||||
if dest_resource:
|
||||
# also require similar length
|
||||
l1 = len(resource.search_fields['text'])
|
||||
l2 = dest_resource.text_len
|
||||
if 0.95 * l2 <= l1 <= 1.05 * l2:
|
||||
merge_ids = list(
|
||||
filter(lambda elem: elem != similar_id, similar_ids)
|
||||
)
|
||||
break
|
||||
else:
|
||||
dest_resource = None
|
||||
merge_ids = []
|
||||
|
||||
# update or create the destination resource
|
||||
if dest_resource:
|
||||
is_new_resource = False
|
||||
resource.simhash = create_simhash(index, dest_resource.id_, simhash)
|
||||
await dest_resource.update_from_resource(resource)
|
||||
resource = dest_resource
|
||||
else:
|
||||
is_new_resource = True
|
||||
resource.simhash = simhash_to_bigint(simhash)
|
||||
await resource.save(conn)
|
||||
create_simhash(index, resource.id_, simhash)
|
||||
|
||||
# add resource to search index
|
||||
if resource.content_type in ('html', 'plain'):
|
||||
await index_resource(
|
||||
app.search_engine,
|
||||
tf,
|
||||
site_path,
|
||||
resource,
|
||||
site.base_url,
|
||||
url,
|
||||
)
|
||||
|
||||
# merge resources: merge_ids -> resource
|
||||
for merge_id in merge_ids:
|
||||
# replace links to the merge resource with links to the dest resource
|
||||
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
|
||||
await conn.execute(sql, resource.id_ or None, merge_id)
|
||||
# remove orphaned merge resource
|
||||
sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
|
||||
found = await conn.fetchval(sql, merge_id)
|
||||
if found:
|
||||
await delete_resource(
|
||||
app.search_engine,
|
||||
found[1],
|
||||
merge_id,
|
||||
)
|
||||
|
||||
return True, is_new_resource
|
||||
|
||||
|
||||
async def _save_shortlink(
|
||||
conn, site, url, resource, shortlink_url, last_visit
|
||||
):
|
||||
"""
|
||||
Save a shortlink.
|
||||
"""
|
||||
shortlink_durl = await Durl(shortlink_url, base=site.base_url)
|
||||
if shortlink_durl and shortlink_url != url:
|
||||
sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2"
|
||||
sl_path = shortlink_durl.pwa()
|
||||
row = await conn.fetchrow(sql, site.id_, sl_path)
|
||||
shortlink = await SitePath().load_from_row(row)
|
||||
if not shortlink:
|
||||
shortlink = SitePath(
|
||||
site_id=site.id_,
|
||||
path=sl_path,
|
||||
last_visit=last_visit,
|
||||
ok_count=1,
|
||||
canonical=False,
|
||||
resource_id=resource.id_,
|
||||
)
|
||||
else:
|
||||
shortlink.last_visit = last_visit
|
||||
shortlink.ok_count += 1
|
||||
shortlink.canonical = False
|
||||
shortlink.resource_id = resource.id_
|
||||
await shortlink.save(conn)
|
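For illustration only (not part of this commit): a sketch of how get_site_path and process_site_path combine for one path of a site; app, conn, fetcher, tf and site are assumed to be provided by the crawler's main loop, and the seven-day re-crawl interval is an arbitrary example value.

# Sketch: pick the next due path of a site and process it.
from datetime import datetime, timedelta

from atextcrawler.resource import get_site_path, process_site_path


async def process_one_path(app, conn, fetcher, tf, site, worker_number=0):
    # paths visited within the last 7 days (example value) are not due yet
    before = datetime.utcnow() - timedelta(days=7)
    site_path = await get_site_path(conn, site, before)
    if site_path is None:
        return False  # nothing to do for this site right now
    # True only if a new text resource was handled
    return await process_site_path(
        app, worker_number, conn, fetcher, tf, site, site_path
    )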
355
src/atextcrawler/resource/page.py
Normal file
@@ -0,0 +1,355 @@
"""
|
||||
Parse HTML pages.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from copy import deepcopy
|
||||
from typing import Optional, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from tidylib import tidy_document
|
||||
|
||||
from ..models import ResourceError, ResourceRedirect, Site, TextResource
|
||||
from ..utils.annotation import (
|
||||
annotate,
|
||||
annotations_remove_section,
|
||||
clean_annotations,
|
||||
get_tag_counts,
|
||||
headline_probability,
|
||||
)
|
||||
from ..utils.date_finder import extract_latest_date
|
||||
from ..utils.durl import Durl, assort_links
|
||||
from ..utils.html import (
|
||||
clean_body,
|
||||
clean_page,
|
||||
extract_title,
|
||||
get_html_lang,
|
||||
get_html_redirect,
|
||||
)
|
||||
from ..utils.http import get_header_links
|
||||
from ..utils.lang import extract_content_language
|
||||
from ..utils.section import iter_sections
|
||||
from ..utils.tag import keep_tags
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger_debug = logging.getLogger(__name__ + '.debug')
|
||||
logger_debug.setLevel(logging.INFO)
|
||||
logger_links = logging.getLogger(__name__ + '.debug.links')
|
||||
logger_stats = logging.getLogger(__name__ + '.debug.stats')
|
||||
logger_sections = logging.getLogger(__name__ + '.debug.sections')
|
||||
|
||||
|
||||
async def parse_html(
|
||||
durl: Durl,
|
||||
resp: dict,
|
||||
site: Optional[Site],
|
||||
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
|
||||
"""
|
||||
Extract relevant data from a response returning a TextResource instance.
|
||||
|
||||
The given URL must be the full URL (incl. scheme and netloc) of the page.
|
||||
"""
|
||||
html = resp['content']
|
||||
|
||||
# follow link to canonical URL
|
||||
header_links = await get_header_links(resp['headers'], durl, site)
|
||||
if canonical := header_links.get('canonical'):
|
||||
if canonical != durl.url():
|
||||
return ResourceRedirect(resp['redirects'] + [canonical])
|
||||
|
||||
# follow html redirect, if present
|
||||
if redir_url := get_html_redirect(html):
|
||||
if redir_url not in resp['redirects']:
|
||||
return ResourceRedirect(resp['redirects'] + [redir_url])
|
||||
else:
|
||||
msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}'
|
||||
return ResourceError(msg)
|
||||
|
||||
# require html tag
|
||||
if not html[:14].lower().startswith('<!doctype html'):
|
||||
if '<html' not in html:
|
||||
return None
|
||||
|
||||
# real URL after redirection
|
||||
url = resp['redirects'][-1]
|
||||
durl = await Durl(url)
|
||||
if not durl:
|
||||
return None
|
||||
|
||||
# page title
|
||||
title = extract_title(html)
|
||||
|
||||
# tidy html
|
||||
try:
|
||||
html, _ = tidy_document(
|
||||
html.encode('utf-8'),
|
||||
options={
|
||||
'logical-emphasis': 1,
|
||||
'merge-divs': 1,
|
||||
'merge-spans': 1,
|
||||
'hide-comments': 1,
|
||||
'output-bom': 0,
|
||||
'show-errors': 0,
|
||||
},
|
||||
)
|
||||
html = html.decode('utf-8')
|
||||
except:
|
||||
msg = f'Cannot tidy html from {url}'
|
||||
return ResourceError(msg)
|
||||
|
||||
# drop irrelevant tags, including their contents
|
||||
soup = clean_page(html)
|
||||
|
||||
# extract shortlink (from http headers or html head)
|
||||
shortlink = header_links.get('shortlink')
|
||||
if not shortlink and soup.head:
|
||||
for link in soup.head.find_all('link'):
|
||||
if 'shortlink' in link.get('rel', ''):
|
||||
if link.get('href'):
|
||||
shortlink = link.get('href')
|
||||
break
|
||||
|
||||
# language, plaintext, annotations, last change
|
||||
lang = get_html_lang(html)
|
||||
html = clean_body(str(soup.body))
|
||||
head = soup.head
|
||||
text, annotations = annotate(html)
|
||||
if lng := extract_content_language(text):
|
||||
lang = lng
|
||||
last_change = extract_latest_date(html, lang=lang)
|
||||
|
||||
# assort internal and external links
|
||||
base_url = None
|
||||
if head and head.base:
|
||||
base_url = head.base.get('href')
|
||||
if not base_url and site:
|
||||
base_url = site.base_url
|
||||
cleaned_links, links_int, links_ext = await assort_links(
|
||||
annotations['links'], durl, text, base_url
|
||||
)
|
||||
annotations['links'] = cleaned_links
|
||||
if logger_links.isEnabledFor(logging.DEBUG):
|
||||
logger_links.debug('==== internal links')
|
||||
for durl_, txt in links_int.items():
|
||||
logger_links.debug(f'{durl_.url()} {txt}')
|
||||
logger_links.debug('==== external links')
|
||||
for durl_, txt in links_ext.items():
|
||||
logger_links.debug(f'{durl_.url()} {txt}')
|
||||
|
||||
# keywords from category links
|
||||
category_links = set()
|
||||
for href, (i, f, rel) in annotations['links'].items():
|
||||
if rel and ('category' in rel or 'tag' in rel):
|
||||
category_links.add(text[i:f])
|
||||
keywords = sorted(category_links)
|
||||
|
||||
# filter out irrelevant sections
|
||||
filtered_text, filtered_ann = filter_sections(
|
||||
text, annotations, site.boilerplate_texts if site else None
|
||||
)
|
||||
|
||||
# debug statistics
|
||||
if logger_stats.isEnabledFor(logging.DEBUG):
|
||||
sb = annotations['semantic_breaks']
|
||||
fsb = filtered_ann['semantic_breaks']
|
||||
logger_stats.debug(
|
||||
f'Page statistics:'
|
||||
f' html_len={len(html)} text_len={len(filtered_text)}'
|
||||
f' ratio={len(filtered_text) / len(html):.2f};'
|
||||
f' sections={len(sb)} filtered_sections={len(fsb)}'
|
||||
f' ratio={len(fsb) / len(sb):.2f} url={durl.url()}'
|
||||
)
|
||||
|
||||
return TextResource(
|
||||
content_type='html',
|
||||
last_change=last_change,
|
||||
text_len=len(text),
|
||||
lang=lang,
|
||||
title=title,
|
||||
init_fields={
|
||||
'durl': durl,
|
||||
'site': site,
|
||||
'headers': resp['headers'],
|
||||
'redirects': resp['redirects'],
|
||||
'links_int': links_int,
|
||||
'links_ext': links_ext,
|
||||
'shortlink': shortlink,
|
||||
'canonical': True if canonical else None,
|
||||
'head': head,
|
||||
},
|
||||
search_fields={
|
||||
'title': title,
|
||||
'pub_date': last_change,
|
||||
'keywords': keywords,
|
||||
'text': filtered_text,
|
||||
'annotations': filtered_ann,
|
||||
'head': str(head),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def filter_sections(text, annotations, boilerplate_texts):
|
||||
"""
|
||||
Filter out irrelevant sections using scores and factoring in neighbors.
|
||||
"""
|
||||
tags = annotations['tags']
|
||||
sb = annotations['semantic_breaks']
|
||||
section_ids = annotations['section_ids']
|
||||
|
||||
# for i1,f1 in sorted(tags.keys()):
|
||||
# print(' ', i1,f1,tags[(i1,f1)], text[i1:f1])
|
||||
# for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
|
||||
# print('-' * lvl, i,f,','.join(tags[(i+1, f)]), sb[i], txt)
|
||||
# print('_' * 50)
|
||||
# from pprint import pprint
|
||||
# pprint(sb)
|
||||
# pprint(tags)
|
||||
# pprint(section_ids)
|
||||
|
||||
# calculate keep scores for sections
|
||||
# negative scores mean: drop; positive scores mean keep;
|
||||
# scores between -2 and 2 are undecided
|
||||
sections_keep = {}
|
||||
headline_probs = {}
|
||||
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
|
||||
if prob := headline_probability(txt, tags[(i, f)], lvl):
|
||||
headline_probs[(i, f)] = prob
|
||||
w = 0
|
||||
n_chars = f - i - 1
|
||||
# string length
|
||||
w = (n_chars - 80) / 80 # initial weight
|
||||
# punctuation
|
||||
        w += 0.4 * txt.count('.') + 0.1 * txt.count(',')  # count within this section
|
||||
# p tag
|
||||
if 'p' in tags[(i + 1, f)]: # prefer keeping paragraphs
|
||||
w += 0.7
|
||||
# links
|
||||
n_links, link_density, avg_text_len = get_tag_counts(
|
||||
('a',), i, f, tags, text
|
||||
)
|
||||
if link_density > 0.5:
|
||||
w = -n_links
|
||||
elif link_density > 0.3 and avg_text_len < 60:
|
||||
w = -3
|
||||
else:
|
||||
n_li, li_density, li_len = get_tag_counts(
|
||||
('li',), i, f, tags, text
|
||||
)
|
||||
if link_density > 0.2 and li_density > 0.8 and li_len < 50:
|
||||
w = -3
|
||||
if 52 <= lvl < 60:
|
||||
w = max(w, 1.0)
|
||||
if 'sidebar' in ' '.join(section_ids.get(i, [])):
|
||||
w = -3
|
||||
if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt):
|
||||
w = -3
|
||||
# special chars
|
||||
if txt.startswith('←') or txt.endswith('→'): # wordpress navigation
|
||||
w = -3
|
||||
# remove boilerplate texts
|
||||
if boilerplate_texts and txt in boilerplate_texts:
|
||||
w = -10
|
||||
sections_keep[(i, f)] = w, lvl
|
||||
|
||||
# amend keep scores: look at preceding / subsequent sections with
|
||||
# equal level and transfer their keep scores to the current section
|
||||
n = len(sections_keep)
|
||||
sections = list(sorted(sections_keep.keys()))
|
||||
# inspect subsequent sections:
|
||||
for rev_ind, s_range in enumerate(reversed(sections)):
|
||||
ind = n - 1 - rev_ind
|
||||
w, lvl = sections_keep[s_range]
|
||||
if abs(w) <= 2:
|
||||
w_sum = 0
|
||||
n_peers = 0
|
||||
for i in range(ind + 1, min(n, ind + 15)):
|
||||
w_, lvl_ = sections_keep[sections[i]]
|
||||
if lvl_ != lvl:
|
||||
break
|
||||
n_peers += 1
|
||||
w_sum += w_
|
||||
if n_peers >= 3:
|
||||
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
|
||||
# inspect preceding sections:
|
||||
for ind, s_range in enumerate(sections):
|
||||
w, lvl = sections_keep[s_range]
|
||||
if abs(w) <= 2:
|
||||
w_sum = 0
|
||||
n_peers = 0
|
||||
for i in range(ind - 1, max(0, ind - 15), -1):
|
||||
w_, lvl_ = sections_keep[sections[i]]
|
||||
if lvl_ != lvl:
|
||||
break
|
||||
n_peers += 1
|
||||
w_sum += w_
|
||||
if n_peers >= 3:
|
||||
sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
|
||||
|
||||
# amend keep scores: look at sections that could be headlines
|
||||
# for subsequent kept sections and increase their score;
|
||||
# also allow for up to 2 sections inbetween (which will also
|
||||
# have their score increased)
|
||||
for rev_ind, s_range in enumerate(reversed(sections)):
|
||||
ind = n - 1 - rev_ind
|
||||
w, lvl = sections_keep[s_range]
|
||||
if abs(w) <= 2:
|
||||
if headline_probs.get(s_range, 0) > 0.49:
|
||||
# look at subsequent sections with higher level
|
||||
child_weights = []
|
||||
for i in range(ind + 1, n):
|
||||
w_, lvl_ = sections_keep[sections[i]]
|
||||
if lvl_ <= lvl or w_ < -2:
|
||||
break
|
||||
child_weights.append(w_)
|
||||
if nc := len(child_weights):
|
||||
child_avg = sum(child_weights) / nc
|
||||
if w + 1.2 * child_avg > 2:
|
||||
sections_keep[s_range] = w + 1.2 * child_avg, lvl
|
||||
if nc > 1:
|
||||
if (w1 := child_weights[0]) <= 2:
|
||||
sections_keep[sections[ind + 1]] = (
|
||||
w1 + 1.5 * child_avg,
|
||||
lvl,
|
||||
)
|
||||
if nc > 2:
|
||||
if (w2 := child_weights[1]) <= 2:
|
||||
sections_keep[sections[ind + 2]] = (
|
||||
w2 + 2 * child_avg,
|
||||
lvl,
|
||||
)
|
||||
|
||||
# clean annotations
|
||||
clean_annotations(annotations)
|
||||
|
||||
# debug sections
|
||||
if logger_sections.isEnabledFor(logging.DEBUG):
|
||||
logger_sections.debug('============= Weighted sections =============')
|
||||
for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
|
||||
w, lvl = sections_keep[(i, f)]
|
||||
indent = ('+' if w > 2 else '-') * lvl
|
||||
ts = ','.join(tags[(i + 1, f)])
|
||||
logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}')
|
||||
|
||||
# narrow down annotations and text to keep_sections
|
||||
# drop undecided sections
|
||||
filtered_text = text
|
||||
filtered_ann = deepcopy(annotations)
|
||||
for i, f in sorted(sections_keep.keys(), reverse=True):
|
||||
w, lvl = sections_keep[(i, f)]
|
||||
if w <= 2.0:
|
||||
filtered_ann = annotations_remove_section(filtered_ann, i, f)
|
||||
filtered_text = filtered_text[:i] + filtered_text[f:]
|
||||
clean_annotations(filtered_ann)
|
||||
|
||||
# debug filtered sections
|
||||
if logger_sections.isEnabledFor(logging.DEBUG):
|
||||
logger_sections.debug('')
|
||||
logger_sections.debug('============= Filtered sections =============')
|
||||
fsb = filtered_ann['semantic_breaks']
|
||||
ftags = filtered_ann['tags']
|
||||
for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100):
|
||||
indent = ' ' * lvl
|
||||
ts = ','.join(ftags.get((i + 1, f), []))
|
||||
logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}')
|
||||
|
||||
return filtered_text, filtered_ann
|
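For illustration only (not part of this commit): the initial per-section weight above, condensed into a standalone function; it ignores tags, list handling and the neighbour smoothing, and the thresholds (above +2 keep, below -2 drop) are those used in filter_sections.

# Sketch: length and punctuation push a section towards keeping,
# a high link density pushes it towards dropping.
def initial_weight(txt: str, n_links: int, link_density: float) -> float:
    w = (len(txt) - 80) / 80                 # longer sections start higher
    w += 0.4 * txt.count('.') + 0.1 * txt.count(',')
    if link_density > 0.5:                   # mostly link text: treat as navigation
        w = -n_links
    return w


print(initial_weight('Menu Home About Contact', n_links=4, link_density=0.9))  # -4: drop
print(initial_weight('A real paragraph with some text. ' * 4, 1, 0.05))        # about 2.25: keep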
148
src/atextcrawler/resource/plaintext.py
Normal file
@@ -0,0 +1,148 @@
"""
|
||||
Parse plaintext pages.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import pypandoc
|
||||
|
||||
from ..models import ResourceError, ResourceRedirect, Site, TextResource
|
||||
from ..utils.annotation import annotate
|
||||
from ..utils.date_finder import extract_latest_date
|
||||
from ..utils.durl import Durl
|
||||
from ..utils.http import get_header_links
|
||||
from ..utils.lang import extract_content_language
|
||||
from ..utils.muse import parse_muse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MAX_LINK_TEXT_LENGTH = 100
|
||||
"""
|
||||
Maximum length of a link's text to be kept.
|
||||
|
||||
Cf. table site_link, column link_text.
|
||||
"""
|
||||
|
||||
|
||||
re_url = re.compile(
|
||||
r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
|
||||
r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
|
||||
)
|
||||
|
||||
|
||||
re_nl = re.compile(r'\r\n')
|
||||
|
||||
|
||||
re_ws = re.compile(r'\s*\n\s*\n\s*')
|
||||
|
||||
|
||||
re_nn = re.compile(r'\n\n')
|
||||
|
||||
|
||||
async def parse_plaintext(
|
||||
durl: Durl,
|
||||
resp: dict,
|
||||
site: Optional[Site],
|
||||
) -> Optional[Union[ResourceRedirect, TextResource]]:
|
||||
"""
|
||||
Extract relevant data from a response returning a TextResource instance.
|
||||
|
||||
The given URL must be the full URL (incl. scheme and netloc) of the page.
|
||||
"""
|
||||
text = resp['content']
|
||||
|
||||
# HTTP headers, canonical URL, shortlink
|
||||
header_links = await get_header_links(resp['headers'], durl, site)
|
||||
if canonical := header_links.get('canonical'):
|
||||
if canonical != durl.url():
|
||||
return ResourceRedirect(resp['redirects'] + [canonical])
|
||||
shortlink = header_links.get('shortlink')
|
||||
|
||||
if not text:
|
||||
return None
|
||||
|
||||
text = re_nl.sub('\n', text)
|
||||
text = re_ws.sub('\n\n', text)
|
||||
|
||||
# meta info
|
||||
meta: dict[str, Any] = {}
|
||||
muse = None
|
||||
if durl.path.endswith('.muse'):
|
||||
muse = parse_muse(text)
|
||||
if muse:
|
||||
meta, text = muse
|
||||
# title
|
||||
if not meta.get('title'):
|
||||
meta['title'] = text[:200].splitlines()[0]
|
||||
# content language
|
||||
if not meta.get('lang'):
|
||||
meta['lang'] = extract_content_language(text)
|
||||
# publication date
|
||||
if not meta.get('pub_date'):
|
||||
meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang'))
|
||||
|
||||
# links
|
||||
links_int: dict[Durl, tuple[list[str], str]] = {}
|
||||
links_ext: dict[Durl, tuple[list[str], str]] = {}
|
||||
for url in re_url.findall(text):
|
||||
link_durl = await Durl(url[0])
|
||||
if link_durl:
|
||||
if link_durl.site() == durl.site():
|
||||
links_int[link_durl] = [], link_durl.url()
|
||||
else:
|
||||
links_ext[link_durl] = [], link_durl.url()
|
||||
|
||||
if muse:
|
||||
html = pypandoc.convert_text(text, 'html5', format='muse').strip()
|
||||
text, annotations = annotate(html)
|
||||
else:
|
||||
text, annotations = annotate_text(text)
|
||||
|
||||
return TextResource(
|
||||
content_type=resp['parser'],
|
||||
last_change=meta.get('pub_date'),
|
||||
text_len=len(text),
|
||||
lang=meta.get('lang'),
|
||||
title=meta.get('title'),
|
||||
init_fields={
|
||||
'durl': durl,
|
||||
'site': site,
|
||||
'headers': resp['headers'],
|
||||
'redirects': resp['redirects'],
|
||||
'links_int': links_int,
|
||||
'links_ext': links_ext,
|
||||
'shortlink': shortlink,
|
||||
'canonical': None,
|
||||
},
|
||||
search_fields={
|
||||
'title': meta.get('title'),
|
||||
'authors': meta.get('authors'),
|
||||
'pub_date': meta.get('pub_date'),
|
||||
'keywords': meta.get('keywords'),
|
||||
'summary': meta.get('summary'),
|
||||
'text': text,
|
||||
'annotations': annotations,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def annotate_text(text):
|
||||
"""
|
||||
    Return annotations as :func:`utils.annotation.annotate` does.
|
||||
|
||||
Here we only have information on semantic breaks
|
||||
(in plaintext they are where empty lines are).
|
||||
"""
|
||||
semantic_breaks = {}
|
||||
for match in re_nn.finditer(text):
|
||||
semantic_breaks[match.span()[0]] = ''
|
||||
annotations = {
|
||||
'tags': {},
|
||||
'semantic_breaks': semantic_breaks,
|
||||
'section_ids': {},
|
||||
'links': {},
|
||||
}
|
||||
return text, annotations
|
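For illustration only (not part of this commit): the annotations annotate_text produces for a small plaintext document, assuming the package is importable; only semantic breaks are recorded, at the offsets of the empty lines.

# Sketch: one semantic break per empty line, no tags and no links in plaintext.
from atextcrawler.resource.plaintext import annotate_text

text, annotations = annotate_text('First paragraph.\n\nSecond paragraph.\n\nThird.')
print(annotations['semantic_breaks'])             # {16: '', 35: ''}
print(annotations['tags'], annotations['links'])  # {} {}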
149
src/atextcrawler/resource/sitemap.py
Normal file
@@ -0,0 +1,149 @@
"""
|
||||
Sitemap and SitemapIndex and related operations.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import pytz
|
||||
|
||||
from ..models import Sitemap, SitemapIndex, TextResource
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def get_sitemap_urls(
|
||||
fetcher,
|
||||
base_url: Optional[str],
|
||||
sitemaps=None,
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Try to find sitemaps and fetch and return their URL content.
|
||||
|
||||
Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.
|
||||
"""
|
||||
if sitemaps:
|
||||
# test example: https://www.berlin.de/
|
||||
check_all = True
|
||||
elif base_url:
|
||||
sitemaps = [
|
||||
base_url.rstrip('/') + '/sitemap.xml',
|
||||
base_url.rstrip('/') + '/wp-sitemap.xml',
|
||||
base_url.rstrip('/') + '/sitemap_index.xml',
|
||||
base_url.rstrip('/') + '/sitemap.xml.gz',
|
||||
base_url.rstrip('/') + '/sitemap_index.xml.gz',
|
||||
base_url.rstrip('/') + '/sitemap.txt',
|
||||
base_url.rstrip('/') + '/sitemap/',
|
||||
base_url.rstrip('/') + '/sitemap1.xml',
|
||||
base_url.rstrip('/') + '/sitemap-index.xml',
|
||||
base_url.rstrip('/') + '/sitemapindex.xml',
|
||||
base_url.rstrip('/') + '/sitemap/index.xml',
|
||||
]
|
||||
check_all = False
|
||||
else:
|
||||
return []
|
||||
urls = []
|
||||
for sitemap in sitemaps:
|
||||
resource = await fetcher.fetch(sitemap)
|
||||
found = True
|
||||
if isinstance(resource, SitemapIndex):
|
||||
for sitemap_ in resource.sitemaps:
|
||||
sitemaps.append(sitemap_['loc'])
|
||||
elif isinstance(resource, Sitemap):
|
||||
urls += resource.urls
|
||||
elif isinstance(resource, TextResource) and resource.content_type in (
|
||||
'html',
|
||||
'plain',
|
||||
):
|
||||
urls += [
|
||||
{'loc': durl.url()}
|
||||
for durl in resource.init_fields['links_int']
|
||||
]
|
||||
else:
|
||||
found = False
|
||||
if found and not check_all:
|
||||
break
|
||||
return urls
|
||||
|
||||
|
||||
def parse_sitemapindex(sitemapindex):
|
||||
"""
|
||||
Parse a sitemap index returning a `SitemapIndex` with found sitemaps.
|
||||
"""
|
||||
sitemaps = []
|
||||
for tag in sitemapindex.find_all('sitemap'):
|
||||
if loc := tag.find('loc'):
|
||||
if loc.string:
|
||||
sitemap = {'loc': loc.string.strip()}
|
||||
if lastmod := tag.find('lastmod'):
|
||||
try:
|
||||
t = datetime.fromisoformat(lastmod.string.strip())
|
||||
sitemap['lastmod'] = t
|
||||
except Exception:
|
||||
pass
|
||||
sitemaps.append(sitemap)
|
||||
return SitemapIndex(sitemaps=sitemaps)
|
||||
|
||||
|
||||
def parse_sitemap(urlset) -> Sitemap:
|
||||
"""
|
||||
Return a Sitemap with the URLs contained in *urlset*.
|
||||
|
||||
Each URL is a dict with these keys+values:
|
||||
|
||||
* loc: the full URL of a mapped resource
|
||||
* lastmod: optional datetime of its last modification
|
||||
* changefreq: optional info on the change frequency to be expected
|
||||
* priority: optional info on its priority relative to other resources
|
||||
|
||||
Cf. https://www.sitemaps.org/protocol.html
|
||||
"""
|
||||
urls = []
|
||||
for tag in urlset.find_all('url'):
|
||||
if loc := tag.find('loc'):
|
||||
if loc.string:
|
||||
url = {'loc': loc.string.strip()}
|
||||
if lastmod := tag.find('lastmod'):
|
||||
try:
|
||||
t = lastmod.string.strip().rstrip('Z')
|
||||
url['lastmod'] = (
|
||||
datetime.fromisoformat(t)
|
||||
.astimezone(pytz.utc)
|
||||
.replace(tzinfo=None)
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
if changefreq := tag.find('changefreq'):
|
||||
url['changefreq'] = changefreq.string.strip()
|
||||
if priority := tag.find('priority'):
|
||||
url['priority'] = priority.string.strip()
|
||||
urls.append(url)
|
||||
return Sitemap(urls=urls)
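# Usage sketch (illustration only, not part of this commit): both parse
# functions expect an already parsed XML tree exposing find_all()/find(),
# presumably a BeautifulSoup object (beautifulsoup4 is a project dependency;
# the parser choice below is illustrative):
#
#     from bs4 import BeautifulSoup
#     tree = BeautifulSoup(xml_text, 'html.parser')
#     sitemap = parse_sitemap(tree)        # for a <urlset> document
#     index = parse_sitemapindex(tree)     # for a <sitemapindex> document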
|
||||
|
||||
|
||||
def extract_sitemap_paths(
|
||||
base_url: Optional[str],
|
||||
urls: list[dict],
|
||||
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
|
||||
"""
|
||||
Extract essential information from sitemap URLs.
|
||||
|
||||
Return a list of relative paths of the site's resources
|
||||
(in a form to be easily fed into `add_site_paths`) and
|
||||
the datetime of the latest change.
|
||||
|
||||
Relative paths are computed using base_url.
|
||||
"""
|
||||
paths = []
|
||||
latest = None
|
||||
for url in urls:
|
||||
loc = url['loc']
|
||||
lastmod = url.get('lastmod')
|
||||
if loc.startswith(base_url or ''):
|
||||
path = loc.removeprefix(base_url or '').lstrip('/')
|
||||
path = path.split('#', 1)[0]
|
||||
paths.append((path, True))
|
||||
if lastmod:
|
||||
latest = max(lastmod, latest or lastmod)
|
||||
return paths, latest
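# Usage sketch (illustration only, not part of this commit), mirroring how
# process_site() in site/operations.py chains these helpers; `fetcher`,
# `robots`, `conn` and `site_id` are assumed to exist in the caller:
#
#     urls = await get_sitemap_urls(fetcher, base_url, sitemaps=robots.site_maps)
#     paths, latest = extract_sitemap_paths(base_url, urls)
#     await add_site_paths(conn, site_id, paths)   # (path, flag) pairs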
|
6
src/atextcrawler/search/__init__.py
Normal file
6
src/atextcrawler/search/__init__.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
from .engine import (
|
||||
delete_resource,
|
||||
index_resource,
|
||||
shutdown_engine,
|
||||
startup_engine,
|
||||
)
|
270
src/atextcrawler/search/engine.py
Normal file
270
src/atextcrawler/search/engine.py
Normal file
|
@ -0,0 +1,270 @@
|
|||
"""
|
||||
Search engine, for now elasticsearch.
|
||||
|
||||
We have one index per supported language and a default one.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import warnings
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Union
|
||||
|
||||
from elasticsearch import AsyncElasticsearch
|
||||
from elasticsearch.exceptions import NotFoundError
|
||||
|
||||
from ..utils.annotation import pack_annotations
|
||||
from ..utils.section import concat_section_texts
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
warnings.filterwarnings(
|
||||
'ignore',
|
||||
'The client is unable to verify that the'
|
||||
' server is Elasticsearch due security privileges on the server side',
|
||||
)
|
||||
|
||||
|
||||
MIN_INDEXING_TIMEOUT_SECONDS = 5
|
||||
|
||||
|
||||
language_analyzers = {
|
||||
'en': 'english',
|
||||
'de': 'german',
|
||||
#'fr': 'french',
|
||||
#'el': 'greek',
|
||||
#'es': 'spanish',
|
||||
'default': 'standard',
|
||||
}
|
||||
|
||||
|
||||
properties = {
|
||||
'resource_id': {'type': 'long'},
|
||||
'site_id': {'type': 'long'},
|
||||
'url': {'type': 'text'},
|
||||
'base_url': {'type': 'text'},
|
||||
'pub_date': {'type': 'date', 'format': 'yyyy-MM-dd||yyyy-MM||yyyy'},
|
||||
'lang': {'type': 'keyword'},
|
||||
'title': {'type': 'text'},
|
||||
'authors': {'type': 'text'},
|
||||
'summary': {'type': 'text'},
|
||||
'keywords': {'type': 'text'},
|
||||
'collections': {'type': 'keyword'},
|
||||
'time_horizon': {'type': 'keyword'},
|
||||
'orig_source': {'type': 'text'},
|
||||
'topics': {'type': 'text'},
|
||||
'annotations': {'type': 'text', 'index': False},
|
||||
'sections': {
|
||||
'type': 'nested',
|
||||
'properties': {
|
||||
'start_ids': {'type': 'integer'},
|
||||
'end_ids': {'type': 'integer'},
|
||||
'text': {'type': 'text', 'index_options': 'offsets'},
|
||||
'embedding': {'type': 'dense_vector', 'dims': 512},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
async def startup_engine(config):
|
||||
"""
|
||||
Open the search engine for access.
|
||||
"""
|
||||
engine = AsyncElasticsearch(
|
||||
host=config['elasticsearch']['host'],
|
||||
api_key=(
|
||||
config['elasticsearch']['id'],
|
||||
config['elasticsearch']['api_key'],
|
||||
),
|
||||
use_ssl=False,
|
||||
timeout=20,
|
||||
)
|
||||
engine.index_base_name = config['elasticsearch']['index_base_name']
|
||||
await create_indices(engine)
|
||||
await open_indices(engine)
|
||||
return engine
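# Configuration sketch (values are placeholders, not part of this commit):
# startup_engine() expects a mapping like
#
#     config = {'elasticsearch': {
#         'host': 'localhost',
#         'id': '<api-key-id>',
#         'api_key': '<api-key-secret>',
#         'index_base_name': 'atext',
#     }}
#     engine = await startup_engine(config)
#     ...
#     await shutdown_engine(engine)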
|
||||
|
||||
|
||||
async def create_indices(engine):
|
||||
"""
|
||||
Create indices for all configured languages.
|
||||
"""
|
||||
for lang, analyzer in language_analyzers.items():
|
||||
index_name = engine.index_base_name + '_text_' + lang
|
||||
if not await engine.indices.exists(index=index_name):
|
||||
await engine.indices.create(index=index_name)
|
||||
await engine.indices.close(index=index_name)
|
||||
await engine.indices.put_settings(
|
||||
index=index_name,
|
||||
body={
|
||||
'analysis': {'analyzer': {'default': {'type': analyzer}}},
|
||||
'refresh_interval': '60s',
|
||||
},
|
||||
)
|
||||
await engine.indices.put_mapping(
|
||||
index=index_name,
|
||||
body={'properties': properties},
|
||||
)
|
||||
|
||||
|
||||
async def open_indices(engine):
|
||||
"""
|
||||
Open indices for all configured languages.
|
||||
"""
|
||||
for lang in language_analyzers.keys():
|
||||
index_name = engine.index_base_name + '_text_' + lang
|
||||
await engine.indices.open(index=index_name)
|
||||
|
||||
|
||||
async def shutdown_engine(engine):
|
||||
"""
|
||||
Close the connection to the search engine.
|
||||
"""
|
||||
# await close_indices(engine)
|
||||
await engine.close()
|
||||
|
||||
|
||||
async def close_indices(engine):
|
||||
"""
|
||||
Close indices. UNUSED.
|
||||
"""
|
||||
for lang in language_analyzers.keys():
|
||||
index_name = engine.index_base_name + '_text_' + lang
|
||||
await engine.indices.close(index=index_name)
|
||||
|
||||
|
||||
async def index_resource(
|
||||
engine,
|
||||
tf,
|
||||
site_path,
|
||||
resource,
|
||||
base_url,
|
||||
url,
|
||||
):
|
||||
"""
|
||||
Index a resource.
|
||||
"""
|
||||
lang = resource.lang
|
||||
index_lang = lang if lang in language_analyzers.keys() else 'default'
|
||||
index_name = engine.index_base_name + '_text_' + index_lang
|
||||
pub_date = resource.search_fields.get('pub_date')
|
||||
if pub_date:
|
||||
pub_date = str(pub_date.date())
|
||||
text = resource.search_fields.get('text')
|
||||
annotations = resource.search_fields.get('annotations')
|
||||
semantic_breaks = annotations['semantic_breaks']
|
||||
sections = []
|
||||
for section_ids, txt in concat_section_texts(text, semantic_breaks):
|
||||
embedding = await tf.embed(txt)
|
||||
sections.append(
|
||||
{
|
||||
'start_ids': section_ids[0],
|
||||
'end_ids': section_ids[-1],
|
||||
'text': txt,
|
||||
'embedding': embedding,
|
||||
}
|
||||
)
|
||||
doc = {
|
||||
'resource_id': resource.id_,
|
||||
'site_id': site_path.site_id,
|
||||
'url': url,
|
||||
'base_url': base_url,
|
||||
'pub_date': pub_date,
|
||||
'lang': resource.lang,
|
||||
'title': resource.search_fields.get('title'),
|
||||
'authors': resource.search_fields.get('authors'),
|
||||
'summary': resource.search_fields.get('summary'),
|
||||
'keywords': resource.search_fields.get('keywords'),
|
||||
'collections': resource.search_fields.get('collections'),
|
||||
'time_horizon': resource.search_fields.get('time_horizon'),
|
||||
'orig_source': resource.search_fields.get('orig_source'),
|
||||
'topics': resource.search_fields.get('topics'),
|
||||
'annotations': pack_annotations(annotations),
|
||||
'sections': sections,
|
||||
}
|
||||
timeout_seconds = max(MIN_INDEXING_TIMEOUT_SECONDS, int(len(text) / 1000))
|
||||
await engine.index(
|
||||
id=resource.id_,
|
||||
index=index_name,
|
||||
body=doc,
|
||||
timeout=f'{timeout_seconds}s',
|
||||
)
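# Illustration (not part of this commit): the target index is chosen by
# resource language; with index_base_name 'atext' (placeholder):
#
#     index_lang = lang if lang in language_analyzers.keys() else 'default'
#     # lang='de' -> 'atext_text_de'
#     # lang='fr' -> 'atext_text_default'  (no 'fr' analyzer configured)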
|
||||
|
||||
|
||||
async def delete_resource(engine, lang, resource_id):
|
||||
"""
|
||||
Delete a resource.
|
||||
"""
|
||||
index_name = engine.index_base_name + '_text_' + (lang or 'default')
|
||||
try:
|
||||
await engine.delete(index_name, resource_id)
|
||||
except NotFoundError:
|
||||
msg = f'Cannot delete resource from index, not found: {resource_id}'
|
||||
logger.warning(msg)
|
||||
|
||||
|
||||
async def find_duplicate(engine, site_id, resource) -> Union[bool, None, int]:
|
||||
"""
|
||||
UNUSED.
|
||||
|
||||
Try to find a duplicate resource with matching site.
|
||||
|
||||
If the search backend query fails, return False.
|
||||
If no matching resource was found, return None.
|
||||
If a matching resource was found, return its id.
|
||||
"""
|
||||
# get sample texts
|
||||
text = resource.search_fields['text']
|
||||
if not text or len(text) < 100:
|
||||
return None
|
||||
# annotations = resource.search_fields['annotations']
|
||||
# semantic_breaks = annotations['semantic_breaks']
|
||||
# texts = []
|
||||
# for _, txt in concat_section_texts(text, semantic_breaks):
|
||||
# texts.append(txt)
|
||||
# texts = extract_samples(texts)
|
||||
|
||||
# # search for sample texts
|
||||
# text_count = len(texts)
|
||||
# should_min = max(1, int(0.6 * text_count))
|
||||
# should = []
|
||||
# for text in texts:
|
||||
# should.append({'match': {'sections.text': text}})
|
||||
query = {
|
||||
'bool': {
|
||||
'must': {
|
||||
'nested': {
|
||||
'path': 'sections',
|
||||
'query': {'match': {'sections.text': text}},
|
||||
},
|
||||
},
|
||||
'filter': {
|
||||
'term': {
|
||||
'site_id': site_id,
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
fields = [
|
||||
'url',
|
||||
'sections.text',
|
||||
'site_id',
|
||||
]
|
||||
response = await engine.search(
|
||||
index=engine.index_base_name + '_text_*',
|
||||
body={
|
||||
'query': query,
|
||||
'fields': fields,
|
||||
'from': 0,
|
||||
'size': 3,
|
||||
'_source': False,
|
||||
},
|
||||
)
|
||||
if response['timed_out']:
|
||||
return False
|
||||
for hit in response.get('hits', {}).get('hits'):
|
||||
txt = ' '.join(hit['fields']['sections.text'])
|
||||
similarity = SequenceMatcher(None, text, txt).ratio()
|
||||
if similarity > 0.99:
|
||||
return hit['_id']
|
||||
return None
|
9
src/atextcrawler/site/__init__.py
Normal file
9
src/atextcrawler/site/__init__.py
Normal file
|
@ -0,0 +1,9 @@
|
|||
"""
|
||||
Websites.
|
||||
"""
|
||||
|
||||
from .feeds import fetch_feeds
|
||||
from .operations import checkin_site, checkout_site, process_site, update_site
|
||||
from .queue import process_site_queue
|
||||
from .robots import RobotsInfo
|
||||
from .seed import load_seeds
|
68
src/atextcrawler/site/__main__.py
Normal file
68
src/atextcrawler/site/__main__.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
"""
|
||||
Tool for analyzing a website.
|
||||
|
||||
Fetch the startpage and output information to the console.
|
||||
Do not change any persistent data.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import aiohttp
|
||||
|
||||
from ..models import TextResource
|
||||
from ..resource import ResourceFetcher, extract_sitemap_paths, get_sitemap_urls
|
||||
from ..site.robots import RobotsInfo
|
||||
from ..utils.durl import Durl
|
||||
from .parse import parse_startpage
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.WARNING)
|
||||
logger.addHandler(logging.StreamHandler())
|
||||
|
||||
|
||||
async def run():
|
||||
"""
|
||||
Fetch the startpage of a website and show information about it.
|
||||
|
||||
The URL must be given as commandline argument.
|
||||
"""
|
||||
base_url = sys.argv[1]
|
||||
async with aiohttp.ClientSession() as session:
|
||||
if not (base_durl := await Durl(base_url)):
|
||||
return
|
||||
fetcher = ResourceFetcher(session)
|
||||
resource = await fetcher.fetch(base_url)
|
||||
logger.warning(repr(resource))
|
||||
if (
|
||||
isinstance(resource, TextResource)
|
||||
and resource.content_type == 'html'
|
||||
):
|
||||
site = await parse_startpage(resource)
|
||||
# site.crawl_enabled = await site_filter(site)
|
||||
logger.warning(repr(site))
|
||||
logger.warning('')
|
||||
for durl, text in site.links_ext.items():
|
||||
logger.warning(f' {durl} {text}')
|
||||
logger.warning(f'{durl.url()} -------- {text}')
|
||||
logger.warning('')
|
||||
logger.warning(f'Redirects: {resource.init_fields["redirects"]}')
|
||||
logger.warning('')
|
||||
robots = await RobotsInfo(base_url)
|
||||
urls = await get_sitemap_urls(
|
||||
fetcher, base_url, sitemaps=robots.site_maps
|
||||
)
|
||||
paths, latest = extract_sitemap_paths(base_url, urls)
|
||||
for path in paths:
|
||||
logger.warning(path)
|
||||
logger.warning(f'Feeds: {site.feeds}')
|
||||
logger.warning(latest)
|
||||
# sample_links = extract_samples(resource.init_fields['links_int'])
|
||||
# logger.warning(f'************* {sample_links}')
|
||||
else:
|
||||
logger.warning('(No text resource or error.)')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(run())
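# Usage sketch (not part of this commit): assuming the package is installed
# or otherwise importable, run this analyzer as a module with the start URL
# as its only argument:
#
#     python -m atextcrawler.site https://example.org/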
|
100
src/atextcrawler/site/feeds.py
Normal file
100
src/atextcrawler/site/feeds.py
Normal file
|
@ -0,0 +1,100 @@
|
|||
"""
|
||||
High-level feed-related stuff.
|
||||
|
||||
See resource.feed for low-level stuff not primarily related to sites.
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from ..models import Feed
|
||||
from ..resource import store_feed_entries, update_feed
|
||||
|
||||
|
||||
async def store_new_feeds(conn, site_id, feeds: dict):
|
||||
"""
|
||||
Store new feeds in table site_feed.
|
||||
"""
|
||||
sql = "SELECT array_agg(url) FROM site_feed WHERE site_id=$1"
|
||||
known_feeds = (await conn.fetchval(sql, site_id)) or []
|
||||
for feed_url in feeds.keys():
|
||||
if feed_url not in known_feeds:
|
||||
feed = Feed(
|
||||
site_id=site_id,
|
||||
url=feed_url,
|
||||
)
|
||||
await feed.save(conn)
|
||||
|
||||
|
||||
async def get_feeds(conn, site_id) -> list[Feed]:
|
||||
"""
|
||||
Return stored feeds for the given site.
|
||||
"""
|
||||
sql = "SELECT * FROM site_feed WHERE site_id=$1"
|
||||
rows = (await conn.fetch(sql, site_id)) or []
|
||||
return [(await Feed().load_from_row(row)) for row in rows]
|
||||
|
||||
|
||||
async def fetch_feeds(fetcher, conn, site) -> Optional[datetime]:
|
||||
"""
|
||||
Fetch feeds, add new resources and return the latest content update time.
|
||||
"""
|
||||
feeds = await get_feeds(conn, site.id_)
|
||||
latest = None
|
||||
for feed in feeds:
|
||||
feed_content = await update_feed(fetcher, feed, conn)
|
||||
if feed_content:
|
||||
await store_feed_entries(conn, site, feed_content)
|
||||
if feed.t_content:
|
||||
latest = max(latest or feed.t_content, feed.t_content)
|
||||
return latest
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# only use this on a dev instance!
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import aiohttp
|
||||
|
||||
from ..config import Config
|
||||
from ..db import PGPool
|
||||
from ..resource.fetch import ResourceFetcher
|
||||
from .operations import process_site, update_site
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.DEBUG)
|
||||
config = Config().get()
|
||||
url = sys.argv[1]
|
||||
|
||||
async def run():
|
||||
"""
|
||||
Fetch and display a site.
|
||||
"""
|
||||
app = None # TODO
|
||||
async with PGPool(config['postgresql']) as pool:
|
||||
async with pool.acquire() as conn:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
fetcher = ResourceFetcher(session)
|
||||
site, _ = await update_site(app, fetcher, conn, url)
|
||||
logger.warning(site)
|
||||
await process_site(fetcher, conn, site)
|
||||
latest = await fetch_feeds(fetcher, conn, site)
|
||||
logger.warning(f'latest: {latest}')
|
||||
# feed = Feed(url=url)
|
||||
# feed_content = await update_feed(fetcher, feed, conn)
|
||||
# if isinstance(feed_content, ResourceError):
|
||||
# print(feed_content)
|
||||
# else:
|
||||
# print(feed)
|
||||
# pprint(feed_content[0])
|
||||
# print('---- 2nd try ----')
|
||||
# feed_content = await update_feed(fetcher, feed, conn)
|
||||
# if isinstance(feed_content, ResourceError):
|
||||
# print(feed_content)
|
||||
# else:
|
||||
# print(feed)
|
||||
# pprint(feed_content[0])
|
||||
|
||||
asyncio.run(run())
|
267
src/atextcrawler/site/operations.py
Normal file
267
src/atextcrawler/site/operations.py
Normal file
|
@ -0,0 +1,267 @@
|
|||
"""
|
||||
Operations on sites.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
|
||||
from asyncpg import Connection
|
||||
|
||||
from ..models import Crawl, Site, TextResource
|
||||
from ..resource import (
|
||||
add_site_paths,
|
||||
extract_sitemap_paths,
|
||||
get_sitemap_urls,
|
||||
store_boilerplate_texts,
|
||||
)
|
||||
from ..utils.durl import Durl
|
||||
from ..utils.similarity import get_simhash_index
|
||||
from .feeds import fetch_feeds, store_new_feeds
|
||||
from .parse import parse_startpage
|
||||
from .robots import RobotsInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def checkout_site(
|
||||
app, conn: Connection
|
||||
) -> tuple[Optional[Site], bool, bool]:
|
||||
"""
|
||||
Get a site to be crawled and mark it with crawl_active=true.
|
||||
|
||||
Also return whether the site shall be fully crawled; if not, this
|
||||
means that just the resources from the feeds shall be crawled.
|
||||
|
||||
Also return whether more sites might be available.
|
||||
"""
|
||||
async with conn.transaction():
|
||||
sql = (
|
||||
"SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
|
||||
" FROM site WHERE crawl_enabled AND crawl_active = false"
|
||||
" AND (next_full_crawl < now() at time zone 'UTC'"
|
||||
" OR next_feed_crawl < now() at time zone 'UTC')"
|
||||
" LIMIT 1 FOR UPDATE SKIP LOCKED"
|
||||
)
|
||||
row = await conn.fetchrow(sql)
|
||||
if row:
|
||||
site_id = row['id']
|
||||
is_full = row['is_full']
|
||||
sql = "UPDATE site SET crawl_active = true WHERE id=$1"
|
||||
await conn.execute(sql, site_id)
|
||||
site = await Site().load(conn, site_id)
|
||||
if site:
|
||||
site.base_durl = await Durl(site.base_url)
|
||||
if site.base_durl:
|
||||
site.simhash_index = await get_simhash_index(conn, site_id)
|
||||
return site, is_full, True
|
||||
else:
|
||||
# site not available; schedule next crawl
|
||||
int_full = app.config['crawl']['full_crawl_interval']
|
||||
int_feed = app.config['crawl']['feed_crawl_interval']
|
||||
now = datetime.utcnow()
|
||||
t_full = now + timedelta(seconds=int_full)
|
||||
t_feed = now + timedelta(seconds=int_full + int_feed)
|
||||
sql = (
|
||||
"UPDATE site SET crawl_active=false,"
|
||||
" next_full_crawl=$1, next_feed_crawl=$2"
|
||||
" WHERE id=$3"
|
||||
)
|
||||
await conn.execute(sql, t_full, t_feed, site_id)
|
||||
return None, False, True
|
||||
return None, False, True
|
||||
return None, False, False
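# Usage sketch (illustration only, not part of this commit): a crawler worker
# would pair checkout_site() with checkin_site() roughly like this:
#
#     site, is_full, more = await checkout_site(app, conn)
#     if site:
#         crawl = ...  # run a full or feed crawl, depending on is_full
#         await checkin_site(app, conn, site, crawl)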
|
||||
|
||||
|
||||
async def update_site(
|
||||
app, fetcher, conn: Connection, base_url, site: Site = None
|
||||
) -> tuple[Optional[Site], bool]:
|
||||
"""
|
||||
Try to fetch base_url and return a site and whether a new one was created.
|
||||
|
||||
This function is run for all sites (including blacklisted and irrelevant
|
||||
ones). It determines whether the site shall be crawled.
|
||||
|
||||
If an error occurs, return (None, False), and if a site was given,
|
||||
also set it to crawl_enabled=False and remove crawling schedules.
|
||||
|
||||
If base_url could be fetched, update the site, possibly creating
|
||||
a new one.
|
||||
|
||||
If the site has crawl_enabled, and no full crawl is scheduled,
|
||||
schedule one (by updating column `next_full_crawl`).
|
||||
"""
|
||||
# fetch startpage
|
||||
logger.info(f'Updating site={site}, base_url={base_url}')
|
||||
resource = await fetcher.fetch(base_url, site=site)
|
||||
if (
|
||||
not isinstance(resource, TextResource)
|
||||
or resource.content_type != 'html'
|
||||
):
|
||||
if site:
|
||||
site.meta_info['error'] = 'Invalid start page'
|
||||
site.crawl_enabled = False
|
||||
site.next_full_crawl = None
|
||||
site.next_feed_crawl = None
|
||||
await site.save(conn)
|
||||
logger.info(f'Failed startpage {base_url}: {resource}')
|
||||
return None, False
|
||||
|
||||
# parse startpage (extract site information) and save the site
|
||||
site = await parse_startpage(resource, app=app, site=site)
|
||||
site_id, created = await site.save(conn)
|
||||
if created:
|
||||
logger.debug(f'Created {site}')
|
||||
|
||||
# add black-/white-listing info
|
||||
is_allowed = await is_site_allowed(conn, site.id_, base_url)
|
||||
if is_allowed is not None and is_allowed != site.crawl_enabled:
|
||||
site.crawl_enabled = is_allowed
|
||||
await site.save(conn)
|
||||
|
||||
# schedule full crawl, if none is scheduled and the site shall be crawled
|
||||
if site.crawl_enabled:
|
||||
sql = (
|
||||
"UPDATE site"
|
||||
" SET next_full_crawl=now() at time zone 'UTC'"
|
||||
" WHERE id=$1 AND next_full_crawl IS null"
|
||||
)
|
||||
await conn.execute(sql, site_id)
|
||||
|
||||
return site, created
|
||||
|
||||
|
||||
async def is_site_allowed(
|
||||
conn: Connection,
|
||||
site_id: Optional[int],
|
||||
base_url: str,
|
||||
) -> Optional[bool]:
|
||||
"""
|
||||
Return True if the site is whitelisted, False if blacklisted, else None.
|
||||
|
||||
Also add missing site_ids to the annotations.
|
||||
"""
|
||||
sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
|
||||
anns = await conn.fetch(sql, site_id, base_url)
|
||||
for ann in anns:
|
||||
if ann['ann_type'] == 'blacklist':
|
||||
return False
|
||||
if ann['ann_type'] == 'whitelist':
|
||||
return True
|
||||
# add missing site_ids
|
||||
if site_id and any([ann['site_id'] is None for ann in anns]):
|
||||
sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
|
||||
await conn.execute(sql, site_id, base_url)
|
||||
return None
|
||||
|
||||
|
||||
async def process_site(fetcher, conn: Connection, site: Site):
|
||||
"""
|
||||
Process a site: fetch and store more information.
|
||||
|
||||
Store external and internal links; find boilerplate texts;
|
||||
fetch sitemaps; fetch feeds; update date of last publication.
|
||||
"""
|
||||
if not site.id_: # only to satisfy typing
|
||||
return
|
||||
if site.links_ext:
|
||||
await _store_cross_site_links(conn, site.id_, site.links_ext)
|
||||
if site.links_int:
|
||||
paths = []
|
||||
for durl, (rel, _) in site.links_int.items():
|
||||
canon = (rel and rel.lower() == 'canonical') or None
|
||||
paths.append((durl.pwa(), canon))
|
||||
await add_site_paths(conn, site.id_, paths)
|
||||
|
||||
await store_boilerplate_texts(fetcher, conn, site)
|
||||
|
||||
# get sitemaps and add their resources
|
||||
robots = await RobotsInfo(site.base_url) # type: ignore
|
||||
urls = await get_sitemap_urls(
|
||||
fetcher, site.base_url, sitemaps=robots.site_maps
|
||||
)
|
||||
paths_, latest = extract_sitemap_paths(site.base_url, urls)
|
||||
await add_site_paths(conn, site.id_, paths_)
|
||||
|
||||
# store feeds and their resources
|
||||
await store_new_feeds(conn, site.id_, site.feeds)
|
||||
latest_ = await fetch_feeds(fetcher, conn, site)
|
||||
if latest_:
|
||||
latest = max(latest or latest_, latest_)
|
||||
|
||||
# update last_pub
|
||||
if latest:
|
||||
site.last_pub = latest
|
||||
await site.save(conn)
|
||||
|
||||
|
||||
async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
|
||||
"""
|
||||
Unlock the site and schedule next crawl.
|
||||
|
||||
*crawl* is the crawl that has just finished (regularly or stopped).
|
||||
|
||||
If the crawl was stopped (t_end is None), just unlock the site.
|
||||
|
||||
Otherwise schedule a crawl of the same type. After a full crawl
|
||||
a feed crawl is also scheduled if none was scheduled yet.
|
||||
"""
|
||||
if crawl.t_end is None:
|
||||
sql = "UPDATE site SET crawl_active=false WHERE id=$1"
|
||||
await conn.execute(sql, site.id_)
|
||||
elif crawl.is_full:
|
||||
full_interval = app.config['crawl']['full_crawl_interval']
|
||||
feed_interval = app.config['crawl']['feed_crawl_interval']
|
||||
next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
|
||||
next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
|
||||
sql = (
|
||||
"UPDATE site SET crawl_active=false, next_full_crawl=$1,"
|
||||
" next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
|
||||
)
|
||||
await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
|
||||
else:
|
||||
feed_interval = app.config['crawl']['feed_crawl_interval']
|
||||
next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
|
||||
sql = (
|
||||
"UPDATE site SET crawl_active=false, next_feed_crawl=$1"
|
||||
" WHERE id=$2"
|
||||
)
|
||||
await conn.execute(sql, next_feed_crawl, site.id_)
|
||||
|
||||
|
||||
async def _store_cross_site_links(
|
||||
conn: Connection,
|
||||
site_id: int,
|
||||
links: dict[Durl, tuple[list[str], str]],
|
||||
) -> None:
|
||||
"""
|
||||
Put outgoing links into site_link/site_queue for existing/unknown sites.
|
||||
|
||||
Separate outgoing links from *site_id* into two classes:
|
||||
(a) existing sites (rows in table site) and (b) unknown links.
|
||||
Add links from class (a) to table site_link.
|
||||
Add links from class (b) to table site_queue.
|
||||
"""
|
||||
# add outgoing cross-site links for existing sites to table site_link
|
||||
urls = [url.site() for url in links.keys()]
|
||||
values = []
|
||||
sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
|
||||
if rows := await conn.fetch(sql, urls):
|
||||
for row in rows:
|
||||
if (durl := await Durl(row['url'])) in links.keys():
|
||||
_, link_text = links.pop(durl)
|
||||
if site_id != row['id']:
|
||||
values.append((site_id, row['id'], link_text))
|
||||
sql = (
|
||||
"INSERT INTO site_link (src, dst, link_text)"
|
||||
" VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
|
||||
)
|
||||
await conn.executemany(sql, values)
|
||||
|
||||
# add outgoing cross-site links for unknown sites to table site_queue
|
||||
sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
|
||||
values = [
|
||||
(site_id, durl.site()[:200], link_text[:100])
|
||||
for durl, (_, link_text) in links.items()
|
||||
]
|
||||
await conn.executemany(sql, values)
|
255
src/atextcrawler/site/parse.py
Normal file
255
src/atextcrawler/site/parse.py
Normal file
|
@ -0,0 +1,255 @@
|
|||
"""
|
||||
Parsing of a site's startpage.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..models import Site, TextResource
|
||||
from ..resource import feed_types
|
||||
from ..utils.durl import Durl, get_ips
|
||||
from ..utils.html import clean_html
|
||||
from ..utils.lang import clean_lang
|
||||
from ..utils.link import (
|
||||
extract_domain,
|
||||
in_blacklist,
|
||||
link_rels,
|
||||
meta_names,
|
||||
meta_props,
|
||||
)
|
||||
|
||||
re_meta_keyword_sep = re.compile('[,;\r\n]')
|
||||
|
||||
|
||||
def cut_str(s: Optional[str], l: int) -> Optional[str]:
|
||||
"""
|
||||
Cut a string *s* to a maximum length *l*, keeping only its beginning.
|
||||
"""
|
||||
return s[:l] if s else None
|
||||
|
||||
|
||||
async def parse_startpage(
|
||||
startpage: TextResource, app=None, site=None
|
||||
) -> Site:
|
||||
"""
|
||||
Parse a site's startpage and return a Site instance.
|
||||
|
||||
If a site instance is given, update it.
|
||||
"""
|
||||
durl = startpage.init_fields['durl']
|
||||
soup = startpage.init_fields['head']
|
||||
meta = collect_meta_tags(soup)
|
||||
meta_links = await collect_meta_links(soup, durl)
|
||||
links_ext = await collect_external_links(startpage, meta_links)
|
||||
links_int = startpage.init_fields['links_int']
|
||||
langs = extract_languages(startpage, meta, meta_links)
|
||||
title, description, keywords = extract_meta_texts(startpage, meta)
|
||||
|
||||
# feeds
|
||||
feeds = meta_links['feeds']
|
||||
if 'wordpress' in meta.get('generator', '').lower():
|
||||
url = durl.site() + 'feed/'
|
||||
feeds[url] = 'application/rss+xml'
|
||||
# TODO later: maybe also probe other possible feed paths 'rss', 'rss/'
|
||||
|
||||
# network params (canonical_url, base_urls, domains)
|
||||
ips = await get_ips(durl.hostname)
|
||||
redirects = []
|
||||
for redirect in startpage.init_fields['redirects']:
|
||||
redir_url = await Durl(redirect)
|
||||
if redir_url:
|
||||
redirects.append(redir_url.site())
|
||||
base_urls = redirects + [durl.url()]
|
||||
domains = [extract_domain(durl.hostname)]
|
||||
|
||||
if site: # update an existing Site
|
||||
site.canonical_url = meta_links['canonical_url'] or site.canonical_url
|
||||
site.base_urls = base_urls
|
||||
site.domains = domains
|
||||
site.ips = ips
|
||||
site.last_update = datetime.utcnow()
|
||||
site.last_pub = startpage.last_change
|
||||
site.langs = langs
|
||||
site.alt_langs = meta_links['alt_langs']
|
||||
site.title = title
|
||||
site.description = description
|
||||
site.keywords = keywords
|
||||
site.linkbacks.update(meta_links['linkbacks'])
|
||||
site.meta_info = meta
|
||||
site.__post_init__(
|
||||
base_durl=durl,
|
||||
feeds=feeds,
|
||||
links_ext=links_ext,
|
||||
links_int=links_int,
|
||||
startpage_text=startpage.search_fields['text'],
|
||||
)
|
||||
else: # create new Site instance
|
||||
site = Site(
|
||||
# post_init fields
|
||||
base_durl=durl,
|
||||
feeds=feeds,
|
||||
links_ext=links_ext,
|
||||
links_int=links_int,
|
||||
startpage_text=startpage.search_fields['text'],
|
||||
# dataclass fields
|
||||
canonical_url=meta_links['canonical_url'],
|
||||
base_urls=base_urls,
|
||||
domains=domains,
|
||||
ips=ips,
|
||||
last_update=datetime.utcnow(),
|
||||
last_pub=startpage.last_change,
|
||||
langs=list(langs),
|
||||
alt_langs=meta_links['alt_langs'],
|
||||
title=title,
|
||||
description=description,
|
||||
keywords=keywords,
|
||||
linkbacks=meta_links['linkbacks'],
|
||||
meta_info=meta,
|
||||
)
|
||||
if site.ips is None and site.url:
|
||||
site.ips = await get_ips(site.url.hostname)
|
||||
if app and site.startpage_text:
|
||||
site_filter = app.plugins['filter_site'].site_filter
|
||||
site.crawl_enabled = await site_filter(site)
|
||||
return site
|
||||
|
||||
|
||||
def collect_meta_tags(soup):
|
||||
"""
|
||||
Collect selected meta tags (meta_names and meta_props) with their values.
|
||||
"""
|
||||
meta = {}
|
||||
for tag in soup.find_all('meta'):
|
||||
if (name := tag.get('name')) and name in meta_names:
|
||||
meta[name] = tag.get('content')
|
||||
if (property := tag.get('property')) in meta_props:
|
||||
if content := tag.get('content'):
|
||||
meta[property] = content
|
||||
if tag.get('http-equiv') == 'content-language': # old html
|
||||
if content := tag.get('content'):
|
||||
meta['http_equiv_lang'] = content
|
||||
return meta
|
||||
|
||||
|
||||
async def collect_meta_links(soup, base_durl) -> dict[str, Any]:
|
||||
"""
|
||||
Collect link tags with site scope (feeds, linkbacks, canonical, ...).
|
||||
"""
|
||||
linkbacks = {}
|
||||
feeds = {}
|
||||
alt_langs = {}
|
||||
canonical_url = None
|
||||
for tag in soup.find_all('link'):
|
||||
if not (rels := set(tag.get('rel', []))) or not rels & link_rels:
|
||||
continue
|
||||
if not (url := tag.get('href')):
|
||||
continue
|
||||
if not (link_durl := await Durl(url, base=base_durl)):
|
||||
continue
|
||||
if in_blacklist(link_durl.hostname):
|
||||
continue
|
||||
link_url = link_durl.url()
|
||||
link_type = tag.get('type')
|
||||
if link_type in feed_types:
|
||||
feeds[link_url] = link_type
|
||||
elif 'canonical' in rels:
|
||||
canonical_url = link_url
|
||||
elif 'alternate' in rels and (hreflang := tag.get('hreflang')):
|
||||
if lang := clean_lang(hreflang):
|
||||
alt_langs[lang] = link_durl.url()
|
||||
elif 'webmention' in rels:
|
||||
linkbacks[link_url] = 'webmention'
|
||||
elif 'pingback' in rels:
|
||||
linkbacks[link_url] = 'pingback'
|
||||
if canonical_url:
|
||||
if canonical_durl := await Durl(canonical_url):
|
||||
canonical_url = canonical_durl.site()
|
||||
else:
|
||||
canonical_url = None
|
||||
return {
|
||||
'feeds': feeds,
|
||||
'linkbacks': linkbacks,
|
||||
'alt_langs': alt_langs,
|
||||
'canonical_url': canonical_url,
|
||||
}
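# Illustration (hedged, not part of this commit): for a page head containing
#
#     <link rel="alternate" type="application/rss+xml" href="/feed/">
#     <link rel="canonical" href="https://example.org/">
#
# collect_meta_links() would return roughly (assuming 'application/rss+xml'
# is in feed_types and 'alternate'/'canonical' are in link_rels):
#
#     {'feeds': {'https://example.org/feed/': 'application/rss+xml'},
#      'linkbacks': {}, 'alt_langs': {},
#      'canonical_url': 'https://example.org/'}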
|
||||
|
||||
|
||||
async def collect_external_links(startpage, meta_links) -> dict[str, str]:
|
||||
"""
|
||||
Return external links (mapping from URL to link text) from startpage.
|
||||
|
||||
Also add links to alternate language variants of the site.
|
||||
"""
|
||||
external_links = startpage.init_fields['links_ext'].copy()
|
||||
netloc = startpage.init_fields['durl'].netloc
|
||||
for lang, lang_url in meta_links['alt_langs'].items():
|
||||
if netloc not in lang_url:
|
||||
durl = await Durl(lang_url)
|
||||
if durl:
|
||||
external_links[durl] = f'Alternate language: {lang}'
|
||||
return external_links
|
||||
|
||||
|
||||
def extract_meta_texts(page, meta) -> tuple[str, Optional[str], list[str]]:
|
||||
"""
|
||||
Extract and return title, description, keywords from a page and meta tags.
|
||||
"""
|
||||
title = meta.get('og:site_name')
|
||||
if not title:
|
||||
title = page.search_fields['title'] or ''
|
||||
if meta_title := meta.pop('title', None):
|
||||
if meta_title.lower() not in title.lower():
|
||||
title += ('; ' if title else '') + meta_title
|
||||
title = cut_str(clean_html(title), 200)
|
||||
description = cut_str(clean_html(meta.pop('description', None)), 2000)
|
||||
if meta_keywords := meta.pop('keywords', None):
|
||||
kws = re_meta_keyword_sep.split(meta_keywords)
|
||||
keywords = [kw.strip()[:50] for kw in kws if kw.strip()]
|
||||
if len(keywords) < 2:
|
||||
keywords = [
|
||||
kw.strip()[:50]
|
||||
for kw in meta_keywords.split(' ')
|
||||
if kw.strip()
|
||||
]
|
||||
else:
|
||||
keywords = []
|
||||
return title, description, keywords
|
||||
|
||||
|
||||
def extract_languages(page, meta, meta_links) -> set[str]:
|
||||
"""
|
||||
Extract languages from a page's html tag, meta tags and HTTP headers.
|
||||
|
||||
Also add the language detected in the text content of the page.
|
||||
|
||||
Return a set of ISO 639-1 language codes.
|
||||
|
||||
See also https://www.w3.org/International/questions/qa-http-and-lang and
|
||||
https://www.w3.org/International/questions/qa-html-language-declarations
|
||||
"""
|
||||
languages = set()
|
||||
if lang := clean_lang(page.lang):
|
||||
languages.add(lang)
|
||||
if lang := clean_lang(meta.get('http_equiv_lang')):
|
||||
languages.add(lang)
|
||||
if lang := clean_lang(meta.get('dc.language')):
|
||||
languages.add(lang)
|
||||
if lang := clean_lang(meta.get('og:locale')):
|
||||
languages.add(lang)
|
||||
for lang, lang_url in meta_links['alt_langs'].items():
|
||||
if page.init_fields['durl'].netloc in lang_url:
|
||||
if lng := clean_lang(lang):
|
||||
languages.add(lng)
|
||||
lngs = (
|
||||
page.init_fields['headers']
|
||||
.get('Content-Language', '')
|
||||
.lower()
|
||||
.replace(' ', '')
|
||||
.split(',')
|
||||
)
|
||||
for lng in lngs:
|
||||
if lang := clean_lang(lng):
|
||||
languages.add(lang)
|
||||
languages.add(page.lang)
|
||||
return languages
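# Illustration (hedged, not part of this commit): for a page with detected
# language 'en' and an HTTP header 'Content-Language: de, en' and no other
# hints, extract_languages() returns {'en', 'de'}, assuming clean_lang()
# passes these ISO 639-1 codes through unchanged.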
|
127
src/atextcrawler/site/queue.py
Normal file
127
src/atextcrawler/site/queue.py
Normal file
|
@ -0,0 +1,127 @@
|
|||
"""
|
||||
Queue of sites.
|
||||
|
||||
When processing a resource, its external links are put into database table
|
||||
`site_queue`.
|
||||
The items in `site_queue` are processed in :func:`process_site_queue`.
|
||||
This is done base URL by base URL (see :func:`iter_site_queue`).
|
||||
While doing this, cross-site links are put into table `site_link`.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import AsyncIterator, Optional
|
||||
|
||||
import aiohttp
|
||||
from asyncpg import Connection
|
||||
|
||||
from ..resource import ResourceFetcher
|
||||
from .operations import update_site
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def process_site_queue(app, pool):
|
||||
"""
|
||||
Loop over queued sites creating new sites and adding cross-site links.
|
||||
"""
|
||||
site_delay = app.config['crawl']['site_delay']
|
||||
resource_delay = app.config['crawl']['resource_delay']
|
||||
async with pool.acquire() as conn:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
fetcher = ResourceFetcher(session)
|
||||
while app.running:
|
||||
async for base_url, links_from in iter_site_queue(app, conn):
|
||||
# get or create site
|
||||
msg = f'Site queue: updating {base_url}'
|
||||
logger.debug(msg)
|
||||
site, created = await update_site(
|
||||
app, fetcher, conn, base_url
|
||||
)
|
||||
if site:
|
||||
await store_incoming_site_site_links(
|
||||
conn, site.id_, links_from
|
||||
)
|
||||
# delete handled queue items
|
||||
sql = "DELETE FROM site_queue WHERE url=$1"
|
||||
await conn.execute(sql, base_url)
|
||||
await app.sleep(resource_delay)
|
||||
logger.debug(
|
||||
f'Queued sites exhausted, sleeping'
|
||||
f' for {site_delay} seconds'
|
||||
)
|
||||
await app.sleep(site_delay)
|
||||
|
||||
|
||||
async def iter_site_queue(
|
||||
app, conn: Connection
|
||||
) -> AsyncIterator[tuple[str, dict[int, str]]]:
|
||||
"""
|
||||
Yield URLs with aggregated link information from site_queue.
|
||||
|
||||
Yield a URL and a dict mapping ids of linking sites to link texts.
|
||||
"""
|
||||
site_revisit_interval = app.config['crawl']['site_revisit_interval']
|
||||
while app.running:
|
||||
sql = (
|
||||
"SELECT url, array_agg(src) srcs,"
|
||||
" array_agg(link_text) link_texts"
|
||||
" FROM site_queue GROUP BY url LIMIT 1"
|
||||
)
|
||||
row = await conn.fetchrow(sql)
|
||||
if row:
|
||||
base_url = row['url']
|
||||
links_from = {}
|
||||
srcs = row['srcs']
|
||||
link_texts = row['link_texts']
|
||||
for i in range(len(srcs)):
|
||||
if src := srcs[i]:
|
||||
links_from[src] = link_texts[i]
|
||||
if site_id := await site_recently_updated(
|
||||
conn, base_url, site_revisit_interval
|
||||
):
|
||||
# just store incoming links and remove the site from the queue
|
||||
await store_incoming_site_site_links(conn, site_id, links_from)
|
||||
sql = "DELETE FROM site_queue WHERE url=$1"
|
||||
await conn.execute(sql, base_url)
|
||||
else:
|
||||
yield base_url, links_from
|
||||
else:
|
||||
break
|
||||
|
||||
|
||||
async def site_recently_updated(
|
||||
conn: Connection,
|
||||
base_url: str,
|
||||
site_revisit_interval: float,
|
||||
) -> Optional[int]:
|
||||
"""
|
||||
Return the id of the site with given base_url if it was updated recently.
|
||||
"""
|
||||
sql = (
|
||||
f"SELECT id FROM site WHERE $1=any(base_urls)"
|
||||
f" AND last_update + interval '{site_revisit_interval} seconds'"
|
||||
f" > now() at time zone 'utc' LIMIT 1"
|
||||
)
|
||||
site_id = await conn.fetchval(sql, base_url)
|
||||
return site_id
|
||||
|
||||
|
||||
async def store_incoming_site_site_links(
|
||||
conn: Connection, site_id: int, links_from: dict
|
||||
):
|
||||
"""
|
||||
Store incoming site-site links (irrespective of crawl_enabled).
|
||||
|
||||
*site_id* is the id of the site to which the links in *links_from* point.
|
||||
"""
|
||||
sql = (
|
||||
"INSERT INTO site_link"
|
||||
" (src, dst, link_text) VALUES ($1, $2, $3)"
|
||||
" ON CONFLICT (src, dst) DO NOTHING"
|
||||
)
|
||||
values = [
|
||||
(from_id, site_id, link_text)
|
||||
for from_id, link_text in links_from.items()
|
||||
if from_id != site_id
|
||||
]
|
||||
await conn.executemany(sql, values)
|
98
src/atextcrawler/site/robots.py
Normal file
98
src/atextcrawler/site/robots.py
Normal file
|
@ -0,0 +1,98 @@
|
|||
"""
|
||||
Fetch and evaluate a website's robots.txt.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional, Union
|
||||
from urllib.robotparser import RobotFileParser
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RobotsInfo(RobotFileParser):
|
||||
"""
|
||||
Obtain information from a site's robots.txt.
|
||||
|
||||
Instances must be awaited to be initialized: ``robots = await RobotsInfo(url)``.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
site_url: str,
|
||||
user_agent: str = '*',
|
||||
session: aiohttp.ClientSession = None,
|
||||
):
|
||||
super().__init__()
|
||||
self.__user_agent = user_agent
|
||||
self.__site_url = site_url.rstrip('/')
|
||||
self.__robots_url = self.__site_url + '/robots.txt'
|
||||
self.__timeout = aiohttp.ClientTimeout(sock_connect=2, sock_read=3)
|
||||
self.__session = session
|
||||
|
||||
def __await__(self):
|
||||
return self.__ainit__().__await__()
|
||||
|
||||
async def __ainit__(self):
|
||||
if self.__session:
|
||||
content = await self.__get_robots_txt(self.__session)
|
||||
else:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
content = await self.__get_robots_txt(session)
|
||||
self.parse(content.splitlines())
|
||||
self.__delay = self.crawl_delay(self.__user_agent)
|
||||
request_rate = self.request_rate(self.__user_agent)
|
||||
if request_rate:
|
||||
self.__delay = request_rate.seconds / request_rate.requests
|
||||
self.__site_maps = super().site_maps() or []
|
||||
return self
|
||||
|
||||
async def __get_robots_txt(self, session: aiohttp.ClientSession) -> str:
|
||||
"""
|
||||
Fetch and return the robots.txt over http.
|
||||
"""
|
||||
try:
|
||||
async with session.get(
|
||||
self.__robots_url, timeout=self.__timeout
|
||||
) as resp:
|
||||
if resp.status == 200:
|
||||
try:
|
||||
content = await resp.text()
|
||||
except Exception:
|
||||
body = await resp.read()
|
||||
content = body.decode(
|
||||
resp.charset or 'utf-8', errors='ignore'
|
||||
)
|
||||
else:
|
||||
content = ''
|
||||
except aiohttp.ClientError:
|
||||
content = ''
|
||||
return content
|
||||
|
||||
@property
|
||||
def user_agent(self) -> str:
|
||||
"""
|
||||
The user agent being used.
|
||||
"""
|
||||
return self.__user_agent
|
||||
|
||||
@property
|
||||
def delay(self) -> Optional[Union[int, float]]:
|
||||
"""
|
||||
The delay to be used between requests.
|
||||
"""
|
||||
return self.__delay
|
||||
|
||||
@property
|
||||
def site_maps(self) -> list[str]:
|
||||
"""
|
||||
The list of sitemaps of the site.
|
||||
"""
|
||||
return self.__site_maps
|
||||
|
||||
def can_fetch_url(self, url: str) -> bool:
|
||||
"""
|
||||
Return whether fetching of the given *url* is allowed.
|
||||
"""
|
||||
return super().can_fetch(self.__user_agent, url)
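# Usage sketch (not part of this commit), matching the use in site/__main__.py:
#
#     robots = await RobotsInfo('https://example.org')
#     robots.site_maps      # sitemap URLs listed in robots.txt
#     robots.delay          # crawl delay / request rate, if declared
#     robots.can_fetch_url('https://example.org/some/path')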
|
72
src/atextcrawler/site/seed.py
Normal file
72
src/atextcrawler/site/seed.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
"""
|
||||
Seeding of new installations with URLs from blacklists and whitelists.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import asyncpg
|
||||
|
||||
from ..utils.durl import Durl
|
||||
|
||||
|
||||
async def load_seeds(config: dict, pool: asyncpg.Pool) -> None:
|
||||
"""
|
||||
Add seed file contents (site blacklist and whitelist).
|
||||
|
||||
If there are sites already, do nothing.
|
||||
"""
|
||||
async with pool.acquire() as conn:
|
||||
site_count = await conn.fetchval("SELECT count(*) FROM site")
|
||||
if site_count:
|
||||
return
|
||||
|
||||
# add blacklist entries
|
||||
values = []
|
||||
blacklist = _load_list(config['config_dir'], 'black')
|
||||
for base_url in blacklist:
|
||||
durl = await Durl(base_url)
|
||||
if durl:
|
||||
url = durl.site()
|
||||
values.append((url, {'source': 'seed file'}))
|
||||
sql = (
|
||||
"INSERT INTO site_annotation (base_url, ann_type, ann_content)"
|
||||
" VALUES ($1, 'blacklist', $2)"
|
||||
)
|
||||
await conn.executemany(sql, values)
|
||||
|
||||
# add whitelist entries
|
||||
values1 = []
|
||||
values2 = []
|
||||
whitelist = _load_list(config['config_dir'], 'white')
|
||||
for base_url in whitelist:
|
||||
durl = await Durl(base_url)
|
||||
if durl:
|
||||
url = durl.site()
|
||||
if url not in blacklist:
|
||||
values1.append((url, {'source': 'seed file'}))
|
||||
values2.append((url,))
|
||||
sql = (
|
||||
"INSERT INTO site_annotation (base_url, ann_type, ann_content)"
|
||||
" VALUES ($1, 'whitelist', $2)"
|
||||
)
|
||||
await conn.executemany(sql, values1)
|
||||
sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)"
|
||||
await conn.executemany(sql, values2)
|
||||
|
||||
|
||||
def _load_list(config_dir, black_white):
|
||||
"""
|
||||
Load the seed black or white list.
|
||||
"""
|
||||
path = Path(config_dir) / 'initial_data' / 'seed_urls.list'
|
||||
with open(path, 'r') as list_file:
|
||||
urls = []
|
||||
for line in list_file.read().strip().splitlines():
|
||||
line_ = line.strip()
|
||||
if line_.startswith('#'):
|
||||
continue
|
||||
if black_white == 'black' and line_.startswith('-'):
|
||||
urls.append(line_[1:].strip())
|
||||
if black_white == 'white' and line_.startswith('+'):
|
||||
urls.append(line_[1:].strip())
|
||||
return urls
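# Example seed file format (hedged sketch of
# <config_dir>/initial_data/seed_urls.list as read by _load_list();
# lines starting with '#' are comments):
#
#     # initial sites
#     +https://example.org/        (whitelisted and queued for crawling)
#     -https://spam.example.com/   (blacklisted)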
|
69
src/atextcrawler/tensorflow.py
Normal file
69
src/atextcrawler/tensorflow.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
"""
|
||||
Query the tensorflow_model_server's REST API.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional, Union
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TensorFlow:
|
||||
"""
|
||||
Fetch an embedding vector from the tensorflow model server.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
app,
|
||||
session: aiohttp.ClientSession,
|
||||
timeout_sock_connect: Union[int, float] = 0.5,
|
||||
timeout_sock_read: Union[int, float] = 10,
|
||||
):
|
||||
self.config = app.config['tensorflow']
|
||||
self.session = session
|
||||
self.timeout = aiohttp.ClientTimeout(
|
||||
sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
|
||||
)
|
||||
|
||||
async def embed(
|
||||
self, text: Union[str, list[str]]
|
||||
) -> Optional[Union[list[float], list[list[float]]]]:
|
||||
"""
|
||||
Query the tensorflow_model_server's REST API for a prediction.
|
||||
|
||||
Take a string or a list of strings and return an embedding vector
|
||||
or a list of embedding vectors.
|
||||
|
||||
If the request fails or times out, return None.
|
||||
"""
|
||||
text_ = text if isinstance(text, list) else [text]
|
||||
data = {'signature_name': 'serving_default', 'instances': text_}
|
||||
try:
|
||||
async with self.session.post(
|
||||
self.config['model_server_endpoint'],
|
||||
json=data,
|
||||
timeout=self.timeout,
|
||||
) as resp:
|
||||
try:
|
||||
res = await resp.json()
|
||||
if isinstance(text, list):
|
||||
return res.get('predictions')
|
||||
else:
|
||||
return res.get('predictions')[0]
|
||||
except Exception:
|
||||
msg = 'Got invalid response from tensorflow'
|
||||
logger.error(msg)
|
||||
return None
|
||||
except Exception as err:
|
||||
msg = 'Could not get embedding from tensorflow for '
|
||||
if isinstance(text, str):
|
||||
msg += f'string of length {len(text)}'
|
||||
else:
|
||||
msg += 'list of strings with lengths '
|
||||
msg += ','.join([str(len(s)) for s in text])
|
||||
msg += f', reason: {err}'
|
||||
logger.error(msg)
|
||||
return None
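# Usage sketch (not part of this commit): app.config['tensorflow'] must
# provide 'model_server_endpoint'; the embedding size is expected to match
# the 512-dim dense_vector mapping in search/engine.py:
#
#     async with aiohttp.ClientSession() as session:
#         tf = TensorFlow(app, session)
#         vec = await tf.embed('some text')          # list of 512 floats, or None
#         vecs = await tf.embed(['text1', 'text2'])  # list of two such vectors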
|
0
src/atextcrawler/utils/__init__.py
Normal file
0
src/atextcrawler/utils/__init__.py
Normal file
481
src/atextcrawler/utils/annotation.py
Normal file
481
src/atextcrawler/utils/annotation.py
Normal file
|
@ -0,0 +1,481 @@
|
|||
"""
|
||||
Convert html to plain text with annotations over character ranges.
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from .json import json_dumps, json_loads
|
||||
from .link import nofollow_link_rels
|
||||
from .tag import keep_tags, self_closing_tags
|
||||
|
||||
MAX_HREF_LENGTH = 200
|
||||
"""
|
||||
Maximum length of an href. Links with longer hrefs are discarded.
|
||||
"""
|
||||
|
||||
|
||||
text_blacklist = [
|
||||
'previous',
|
||||
'next',
|
||||
'back', # common pagination navigation
|
||||
'↩︎', # amusewiki footnote separator (after conversion from muse to html)
|
||||
]
|
||||
"""
|
||||
Texts to ignore.
|
||||
"""
|
||||
|
||||
|
||||
class AnnotatingParser(HTMLParser):
|
||||
"""
|
||||
Parse tagged text resulting in pure text and annotations.
|
||||
|
||||
The text is available in self.text and the annotations
|
||||
in self.annotations, which is a dict with these keys:
|
||||
|
||||
* tags: contains a mapping of offset ranges (i, f) to
|
||||
the tags opening at i and closing at f
|
||||
* semantic_breaks: a mapping of offset positions where
|
||||
a new section begins to the nesting level of that
|
||||
sections; a section is wherever an (opening or closing)
|
||||
separating tag is placed in the raw html; for the
|
||||
separating flag of tags see tag.py
|
||||
* links: a mapping of hrefs to link texts obtained from
|
||||
anchor (a) tags; we skip hyperlinks with nofollow rels
|
||||
* section_ids: map an offset position to the first
|
||||
id attribute (of any tag) at the beginning of a
|
||||
semantic section; this can later be used in a URL
|
||||
fragment for linking directly into this section
|
||||
|
||||
Internally, we put opening tags on self.stack and pop them
|
||||
when the first matching closing tag is encountered. We assume
|
||||
balanced tags (tidy html).
|
||||
|
||||
NB: all tags with semantic breaks have sep=True, i.e.,
|
||||
they will have spaces around them so that the semantic breaks
|
||||
always sit on a space; the semantic break position p is the end
|
||||
of the last section and the next section begins at p + 1.
|
||||
|
||||
The text always begins with a ' ' (added if not in the original),
|
||||
which is assigned a semantic break with default level 80
|
||||
(if there is no semantic break tag at the beginning).
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.text = ' ' # concatenated text data (without tags)
|
||||
self.pos = 1 # equal to len(self.text)
|
||||
self.stack = []
|
||||
self.tags = defaultdict(dict)
|
||||
self.semantic_breaks = {0: 80}
|
||||
self.tag_id = None
|
||||
self.section_ids = defaultdict(list)
|
||||
self.links = {}
|
||||
self.add_space = False
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
Finish by collecting results in dict `self.annotations`.
|
||||
"""
|
||||
super().close()
|
||||
self.annotations = {}
|
||||
self.annotations['links'] = self.links
|
||||
self.annotations['semantic_breaks'] = {
|
||||
pos: lvl for pos, lvl in sorted(self.semantic_breaks.items())
|
||||
}
|
||||
self.annotations['tags'] = self.tags
|
||||
self.annotations['section_ids'] = self.section_ids
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
"""
|
||||
Called for each opening tag.
|
||||
"""
|
||||
sep, lvl, sem = keep_tags[tag]
|
||||
attrs = dict(attrs)
|
||||
if sep:
|
||||
self.add_space = True
|
||||
if tag == 'section' and 'endnotes' in attrs.get('role', ''):
|
||||
lvl = 25
|
||||
# ARIA roles
|
||||
if role := attrs.get('role'):
|
||||
if role == 'article':
|
||||
lvl = 15
|
||||
elif role == 'heading':
|
||||
if aria_level := attrs.get('aria-level'):
|
||||
if aria_level in ('1', '2', '3', '4', '5', '6'):
|
||||
sep, lvl, sem = keep_tags[f'h{aria_level}']
|
||||
elif role == 'region':
|
||||
lvl = 24
|
||||
i = self.pos
|
||||
if tag in self_closing_tags:
|
||||
# self-closing tags will not be added to the result tags,
|
||||
# they only appear in semantic_breaks
|
||||
# the two self-closing tags br and hr both have lvl and sep
|
||||
if i == 1: # replace the default semantic break at pos 0
|
||||
i = 0
|
||||
self.add_semantic_break(i, lvl)
|
||||
i += 1
|
||||
if tag_id := attrs.get('id'):
|
||||
self.tag_id = i, tag_id
|
||||
self.add_tag_id(i) # br or hr may have an id, too
|
||||
self.add_space = True
|
||||
else:
|
||||
self.stack.append((i, tag, sep, lvl, sem, attrs))
|
||||
# forget outdated tag id at new semantic break
|
||||
if lvl:
|
||||
self.forget_tag_id()
|
||||
# memorize tag id
|
||||
if not self.tag_id and (tag_id := attrs.get('id')):
|
||||
self.tag_id = self.pos, tag_id
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
"""
|
||||
Called for each closing tag.
|
||||
"""
|
||||
if not self.stack or (self.stack and self.stack[-1][1] != tag):
|
||||
return # nothing to do for an already closed self-closing tag
|
||||
i, tag_, sep, lvl, sem, attrs = self.stack.pop()
|
||||
f = self.pos
|
||||
# omit tag without content
|
||||
if i == f:
|
||||
return
|
||||
# for a closing div tag revise lvl to minimum level of contained
|
||||
# semantic breaks (if any)
|
||||
if tag == 'div':
|
||||
min_lvl = 101
|
||||
for pos_, lvl_ in reversed(self.semantic_breaks.items()):
|
||||
if pos_ <= i:
|
||||
break
|
||||
min_lvl = min(min_lvl, lvl_)
|
||||
if min_lvl < 101:
|
||||
lvl = min_lvl
|
||||
# add semantic break and an optional section_id
|
||||
if lvl:
|
||||
if i == 1: # replace the default semantic break at pos 0
|
||||
i = 0
|
||||
if tag in ('ul', 'ol', 'li'):
|
||||
seen_tags = [x[1] for x in self.stack]
|
||||
if 'p' not in seen_tags:
|
||||
lvl = 52 + seen_tags.count('tag')
|
||||
if tag == 'li':
|
||||
lvl += 1
|
||||
self.add_semantic_break(i, lvl)
|
||||
self.add_tag_id(i)
|
||||
# do not include surrounding spaces in tag span
|
||||
if self.text[i] == ' ':
|
||||
i += 1
|
||||
# add tag
|
||||
self.tags[(i, f)][tag] = sem
|
||||
# add space (when handling next data)
|
||||
if sep:
|
||||
self.add_space = True
|
||||
# collect links
|
||||
if tag == 'a':
|
||||
self.extract_link(i, attrs)
|
||||
|
||||
def handle_data(self, text):
|
||||
"""
|
||||
Called for each non-tag content between tags.
|
||||
"""
|
||||
# handle empty or blacklisted text
|
||||
if text == '':
|
||||
return
|
||||
if text == ' ':
|
||||
self.add_space = True
|
||||
return
|
||||
if text.strip().lower() in text_blacklist:
|
||||
if ' ' in text:
|
||||
self.add_space = True
|
||||
return
|
||||
# add a space (at self.pos) if the text begins with one
|
||||
# or if we shall add one
|
||||
startswith_space = text.startswith(' ')
|
||||
text = text.lstrip()
|
||||
if startswith_space or self.add_space:
|
||||
if self.text[-1] != ' ':
|
||||
self.text += ' '
|
||||
self.pos += 1
|
||||
self.add_space = False
|
||||
# strip a space at the end of text and handle it in end tag
|
||||
if text.endswith(' '):
|
||||
text = text[:-1]
|
||||
self.add_space = True
|
||||
# add text to self.text
|
||||
self.text += text
|
||||
self.pos += len(text)
|
||||
|
||||
def add_semantic_break(self, pos, lvl):
|
||||
"""
|
||||
Add a semantic break of level *lvl* at position *pos*.
|
||||
"""
|
||||
if pos in self.semantic_breaks:
|
||||
self.semantic_breaks[pos] = min(self.semantic_breaks[pos], lvl)
|
||||
else:
|
||||
self.semantic_breaks[pos] = lvl
|
||||
|
||||
def forget_tag_id(self):
|
||||
"""
|
||||
Reset a tag id if it is too far behind in the text stream.
|
||||
"""
|
||||
if self.tag_id:
|
||||
pos_, tag_id = self.tag_id
|
||||
if pos_ + 200 < self.pos:
|
||||
self.tag_id = None
|
||||
|
||||
def add_tag_id(self, pos):
|
||||
"""
|
||||
Add the memorized id (and clear it) if the section just closing has none yet.
|
||||
|
||||
*pos* is the start position of the current section, and the
|
||||
position where the id will be added.
|
||||
|
||||
Add an id only if we are not too far in the section's text already.
|
||||
"""
|
||||
if self.tag_id:
|
||||
pos_, tag_id = self.tag_id
|
||||
if pos_ < pos + 100 and pos not in self.section_ids:
|
||||
self.section_ids[pos].append(tag_id.lower())
|
||||
self.tag_id = None
|
||||
|
||||
def extract_link(self, i, attrs):
|
||||
"""
|
||||
Add a link covering character range (i, self.pos).
|
||||
|
||||
From html *attrs* extract href and rel.
|
||||
"""
|
||||
if (href := attrs.get('href')) and not attrs.get('rel') == 'nofollow':
|
||||
if href.startswith('#'):
|
||||
return
|
||||
if len(href) > MAX_HREF_LENGTH:
|
||||
return
|
||||
attrs.get('title', '')
|
||||
if rel := attrs.get('rel'):
|
||||
if set(rel) & nofollow_link_rels:
|
||||
return
|
||||
self.links[href] = i, self.pos, rel
|
||||
|
||||
|
||||
def annotate(html):
|
||||
"""
|
||||
Split html text into plain text with annotations (from AnnotatingParser).
|
||||
"""
|
||||
parser = AnnotatingParser()
|
||||
parser.reset()
|
||||
parser.feed(html)
|
||||
parser.close()
|
||||
return parser.text, parser.annotations
|
||||
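# Rough usage sketch (exact values depend on the html and on the tag
# levels defined in tag.py):
#
#     text, anns = annotate('<h1>Title</h1><p>Body text.</p>')
#     # text: plain text with spaces marking block boundaries
#     # anns: dict with the keys 'tags', 'semantic_breaks',
#     #       'section_ids' and 'links'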
|
||||
|
||||
re_footnote = re.compile(r'^\s*\[\d+\]\s+')
|
||||
|
||||
|
||||
def headline_probability(text, tags, lvl) -> float:
|
||||
"""
|
||||
Estimate the probability that the text with tags is a headline.
|
||||
|
||||
The context is not considered: The question is not whether the
|
||||
text is a headline for the following text.
|
||||
"""
|
||||
text = text.strip()
|
||||
res = 0.0
|
||||
if not text:
|
||||
return res
|
||||
if lvl < 60:
|
||||
return 1.0
|
||||
# if 'h1' in tags or 'h2' in tags or 'h3' in tags or\
|
||||
# 'h4' in tags or 'h5' in tags or 'h6' in tags or 'center' in tags:
|
||||
# return 1.0
|
||||
if len(text) < 80:
|
||||
res = 0.7
|
||||
else:
|
||||
res = 0.7 - 0.7 * (len(text) - 80) / 200
|
||||
if 'p' in tags:
|
||||
res -= 0.4
|
||||
if 'em' in tags:
|
||||
res += 0.3
|
||||
if 'a' in tags:
|
||||
res -= 0.1
|
||||
if text[-1] in '.:':
|
||||
res -= 0.3
|
||||
res -= 0.1 * text.count(', ')
|
||||
if re_footnote.match(text):
|
||||
res -= 0.4
|
||||
return max(res, 0.0)
|
||||
|
||||
|
||||
def get_tag_counts(tag_names, i, f, tags, text) -> tuple[int, float, float]:
|
||||
"""
|
||||
Return info on the share of characters covered by tags in *tag_names*.
|
||||
|
||||
Only consider the characters between i and f of string *text*.
|
||||
|
||||
Return the number of tags that have an overlap in the specified region,
|
||||
the tag density in the region (fraction of covered characters by all),
|
||||
and the average number of covered chars per tag.
|
||||
|
||||
NB: If more than one tag name is given, then the fractional share
|
||||
may exceed 1.
|
||||
"""
|
||||
if i == f:
|
||||
return 0, 0.0, 0.0
|
||||
tag_count = 0
|
||||
covered_chars = 0
|
||||
for (s_i, s_f), anns in tags.items():
|
||||
if overlap := range_overlap(i, f - 1, s_i, s_f - 1):
|
||||
for ann in anns:
|
||||
if ann in tag_names:
|
||||
tag_count += 1
|
||||
covered_chars += overlap[1] - overlap[0]
|
||||
all_chars = f - i
|
||||
tag_density = covered_chars * 1.0 / all_chars
|
||||
avg_text_len = covered_chars * 1.0 / tag_count if tag_count else 0
|
||||
return tag_count, tag_density, avg_text_len
|
||||
|
||||
|
||||
def range_overlap(i1, f1, i2, f2):
|
||||
"""
|
||||
Return the overlap of both ranges (None if there is none).
|
||||
"""
|
||||
return None if f1 <= i2 or f2 <= i1 else (max(i1, i2), min(f1, f2))
|
||||
|
||||
|
||||
def annotations_remove_section(annotations, i, f):
|
||||
"""
|
||||
Remove section (i, f) from annotations and return result.
|
||||
"""
|
||||
new_annotations = {}
|
||||
d = f - i
|
||||
if not d:
|
||||
return annotations
|
||||
|
||||
# relocate tags
|
||||
new_tags = {}
|
||||
for (t_i, t_f), anns in annotations['tags'].items():
|
||||
n_i, n_f = cut_range(i, f, d, t_i, t_f)
|
||||
if n_i is not None:
|
||||
new_tags[(n_i, n_f)] = anns
|
||||
new_annotations['tags'] = new_tags
|
||||
|
||||
# relocate links
|
||||
new_links = {}
|
||||
for href, (l_i, l_f, rel) in annotations['links'].items():
|
||||
n_i, n_f = cut_range(i, f, d, l_i, l_f)
|
||||
if n_i is not None:
|
||||
new_links[href] = n_i, n_f, rel
|
||||
|
||||
# relocate semantic breaks and section_ids
|
||||
semantic_breaks = annotations['semantic_breaks']
|
||||
section_ids = annotations['section_ids']
|
||||
new_semantic_breaks = {}
|
||||
new_section_ids = {}
|
||||
for pos in sorted(semantic_breaks.keys()):
|
||||
level = semantic_breaks[pos]
|
||||
if i <= pos and pos < f:
|
||||
continue # discard
|
||||
elif f <= pos:
|
||||
new_semantic_breaks[pos - d] = level
|
||||
if pos in section_ids:
|
||||
new_section_ids[pos - d] = section_ids[pos]
|
||||
else:
|
||||
new_semantic_breaks[pos] = level
|
||||
if pos in section_ids:
|
||||
new_section_ids[pos] = section_ids[pos]
|
||||
|
||||
# collect and return results
|
||||
new_annotations['semantic_breaks'] = new_semantic_breaks
|
||||
new_annotations['section_ids'] = new_section_ids
|
||||
new_annotations['links'] = new_links
|
||||
return new_annotations
|
||||
|
||||
|
||||
def cut_range(i, f, d, t_i, t_f):
|
||||
"""
|
||||
Return the new coordinates of a text range (t_i,t_f) after cutting (i,f).
|
||||
|
||||
If (t_i,t_f) is fully within (i,f), return None, None.
|
||||
"""
|
||||
if t_f < i:
|
||||
return t_i, t_f
|
||||
elif t_i < i <= t_f <= f:
|
||||
return t_i, i
|
||||
elif t_i < i and f <= t_f:
|
||||
return t_i, t_f - d
|
||||
elif i <= t_i and t_f <= f:
|
||||
return None, None
|
||||
elif i <= t_i <= f < t_f:
|
||||
return i, t_f - d
|
||||
else: # f < t_i
|
||||
return t_i - d, t_f - d
|
||||
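# Worked examples for cut_range with the cut (i, f) = (10, 20) and d = 10:
#
#     cut_range(10, 20, 10, 2, 5)    # -> (2, 5)       range before the cut
#     cut_range(10, 20, 10, 5, 15)   # -> (5, 10)      right part clipped
#     cut_range(10, 20, 10, 12, 18)  # -> (None, None) fully inside the cut
#     cut_range(10, 20, 10, 15, 25)  # -> (10, 15)     left part clipped
#     cut_range(10, 20, 10, 25, 30)  # -> (15, 20)     shifted left by d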
|
||||
|
||||
def clean_annotations(annotations: dict) -> None:
|
||||
"""
|
||||
Remove void stuff from annotations.
|
||||
"""
|
||||
cleaned_tags = {}
|
||||
for (i, f), anns in annotations['tags'].items():
|
||||
if f > i and anns:
|
||||
cleaned_tags[(i, f)] = anns
|
||||
annotations['tags'] = cleaned_tags
|
||||
|
||||
|
||||
def pack_annotations(annotations):
|
||||
"""
|
||||
Pack annotations to a special JSON string, reducing their volume a little.
|
||||
"""
|
||||
return json_dumps(
|
||||
{
|
||||
'tags': _pack_tags(annotations['tags']),
|
||||
'semantic_breaks': ','.join(
|
||||
[
|
||||
f'{pos}:{level}'
|
||||
for pos, level in annotations['semantic_breaks'].items()
|
||||
]
|
||||
),
|
||||
'section_ids': annotations['section_ids'],
|
||||
'links': annotations['links'],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _pack_tags(tags: dict) -> str:
|
||||
"""
|
||||
Utility function for packing tag information into a string.
|
||||
"""
|
||||
res = ''
|
||||
for (i, f), anns in tags.items():
|
||||
if anns:
|
||||
anns_ = ','.join([f'{tag}={sem}' for tag, sem in anns.items()])
|
||||
res += f'{i}-{f}:{anns_}\n'
|
||||
return res
|
||||
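# Packed format sketch: one line per tag range, '{i}-{f}:' followed by
# comma-separated 'tag=sem' pairs, e.g.
#
#     _pack_tags({(0, 5): {'em': 'st'}, (6, 12): {'a': ''}})
#     # -> '0-5:em=st\n6-12:a=\n'
#
# unpack_annotations restores the dict from this string.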
|
||||
|
||||
def unpack_annotations(json_text: str) -> dict:
|
||||
"""
|
||||
Unpack tag information from a string.
|
||||
"""
|
||||
annotations = json_loads(json_text)
|
||||
tags = {}
|
||||
for line in annotations['tags'].split('\n'):
|
||||
if line:
|
||||
range_, anns_ = line.split(':')
|
||||
i, f = range_.split('-')
|
||||
i = int(i)
|
||||
f = int(f)
|
||||
anns = {}
|
||||
if anns_:
|
||||
for ann_ in anns_.split(','):
|
||||
tag_, sem_ = ann_.split('=')
|
||||
anns[tag_] = sem_
|
||||
tags[(i, f)] = anns
|
||||
semantic_breaks = {}
|
||||
for sb_ in annotations['semantic_breaks'].split(','):
|
||||
pos_, lvl_ = sb_.split(':')
|
||||
semantic_breaks[int(pos_)] = int(lvl_)
|
||||
return {
|
||||
'tags': tags,
|
||||
'semantic_breaks': semantic_breaks,
|
||||
'section_ids': annotations['section_ids'],
|
||||
'links': annotations['links'],
|
||||
}
|
90
src/atextcrawler/utils/date_finder.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
"""
|
||||
Find date expressions in a string.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
p_day = r'(0?[1-9]|[12][0-9]|3[01])'
|
||||
p_month = r'(0?[1-9]|1[0-2])'
|
||||
p_year = r'(20\d\d|19\d\d)'
|
||||
sep = r'\D{1,2}'
|
||||
p_t = r'(\D{0,4}([01][0-9]|2[0-3]):([0-5][0-9]))?'
|
||||
|
||||
|
||||
format_re = {
|
||||
'iso': (
|
||||
re.compile(f'(^|\\D){p_year}{sep}{p_month}{sep}{p_day}(\\D{p_t}|$)'),
|
||||
(1, 2, 3, 6, 7),
|
||||
),
|
||||
'dmy': (
|
||||
re.compile(f'(^|\\D){p_day}{sep}{p_month}{sep}{p_year}(\\D{p_t}|$)'),
|
||||
(3, 2, 1, 6, 7),
|
||||
),
|
||||
'mdy': (
|
||||
re.compile(f'(^|\\D){p_month}{sep}{p_day}{sep}{p_year}(\\D{p_t}|$)'),
|
||||
(3, 1, 2, 6, 7),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
lang_format = {
|
||||
'de': ('iso', 'dmy'),
|
||||
'en': ('iso', 'mdy'),
|
||||
None: ('iso', 'dmy', 'mdy'),
|
||||
}
|
||||
|
||||
|
||||
def extract_latest_date(text: str, lang: str = None) -> Optional[datetime]:
|
||||
"""
|
||||
Extract the latest date compatible with the *lang* from *text*.
|
||||
|
||||
Only consider dates in the past.
|
||||
"""
|
||||
dates = extract_dates(text, lang=lang)
|
||||
return max(dates) if dates else None
|
||||
|
||||
|
||||
def extract_dates(text: str, lang: str = None) -> list[datetime]:
|
||||
"""
|
||||
Extract dates from a string, optionally limiting formats to a language.
|
||||
"""
|
||||
dates = []
|
||||
fmts = lang_format.get(lang, lang_format[None])
|
||||
for fmt in fmts:
|
||||
re_, slots = format_re[fmt]
|
||||
matches = re_.findall(text)
|
||||
if matches:
|
||||
for match in matches:
|
||||
try:
|
||||
date = datetime(
|
||||
int(match[slots[0]]),
|
||||
int(match[slots[1]]),
|
||||
int(match[slots[2]]),
|
||||
int(match[slots[3]] or 0),
|
||||
int(match[slots[4]] or 0),
|
||||
)
|
||||
if date <= datetime.utcnow():
|
||||
dates.append(date)
|
||||
except ValueError:  # e.g. impossible dates like Feb 30
|
||||
pass
|
||||
return dates
|
||||
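# Illustrative call (assuming the system clock is past the matched date):
#
#     extract_latest_date('updated 2021-03-05 at 14:30', lang='en')
#     # -> datetime(2021, 3, 5, 14, 30)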
|
||||
|
||||
## from htmldate import find_date
|
||||
|
||||
# def extract_last_pub(html):
|
||||
# """
|
||||
# Return an estimate for the time of last content publication from html.
|
||||
# """
|
||||
# # https://stackoverflow.com/questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
|
||||
# lxml_tree = lxml_html.fromstring(bytes(html, encoding='utf8'))
|
||||
# # publication date (from startpage)
|
||||
# try:
|
||||
# date_string = find_date(lxml_tree)
|
||||
# pd = date.fromisoformat(date_string)
|
||||
# last_pub = datetime(pd.year, pd.month, pd.day, 12, 0, 0)
|
||||
# except:
|
||||
# last_pub = None
|
||||
# return last_pub
|
278
src/atextcrawler/utils/durl.py
Normal file
|
@ -0,0 +1,278 @@
|
|||
"""
|
||||
Hyperlink parsing.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
from urllib.parse import urlsplit
|
||||
|
||||
import tldextract
|
||||
from async_dns import types
|
||||
from async_dns.resolver import ProxyResolver
|
||||
from async_lru import alru_cache
|
||||
|
||||
from .link import in_blacklist
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
resolver = ProxyResolver(request_timeout=2)
|
||||
|
||||
|
||||
async_dns_logger = logging.getLogger('async_dns')
|
||||
async_dns_logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
extract = tldextract.TLDExtract(cache_dir=False)
|
||||
|
||||
|
||||
# tldextract uses filelock; set its loglevel to warning
|
||||
filelock_logger = logging.getLogger('filelock')
|
||||
filelock_logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
class Durl:
|
||||
"""
|
||||
Decomposed URL, contains :class:`urllib.parse.SplitResult`.
|
||||
|
||||
When constructing this class, it has to be awaited, e.g.:
|
||||
|
||||
my_durl = await Durl('http://www.example.com/whatever')
|
||||
|
||||
The given URL will be decomposed, validated and normalized.
|
||||
If the URL is invalid, we return None instead of an instance.
|
||||
|
||||
If the given *base* is None, the URL must be absolute and
|
||||
the hostname must be valid (DNS lookup).
|
||||
|
||||
If the given URL is not absolute, an already decomposed (and thus
|
||||
valid) *base* Durl must be given; otherwise the URL is invalid.
|
||||
|
||||
The *base* Durl can contain a path (but no arguments or fragments),
|
||||
in which case the URL - if not absolute - must begin with this path.
|
||||
|
||||
The scheme must be http or https. If the URL begins with '//',
|
||||
'http:' is prepended.
|
||||
|
||||
If the hostname is longer than 90 characters, the URL is invalid.
|
||||
|
||||
Default port numbers (80 for http, 443 for https) are removed.
|
||||
|
||||
The hostname is changed to lower case. Spaces in the hostname
|
||||
make the URL invalid.
|
||||
|
||||
URL fragments are removed.
|
||||
"""
|
||||
|
||||
_url = None
|
||||
_base = None
|
||||
_match_base = False
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url: str,
|
||||
base: Optional['Durl'] = None,
|
||||
match_base: bool = False,
|
||||
):
|
||||
self._url = url
|
||||
self._base = base
|
||||
self._match_base = match_base
|
||||
|
||||
def __await__(self):
|
||||
return self.__ainit__().__await__()
|
||||
|
||||
async def __ainit__(self):
|
||||
res = None
|
||||
try:
|
||||
# add missing scheme for urls beginning with '//'
|
||||
if self._url.startswith('//'):
|
||||
self._url = 'http:' + self._url
|
||||
# split the url
|
||||
durl = urlsplit(self._url)
|
||||
# remove default port numbers 80, 443
|
||||
netloc = durl.netloc
|
||||
if durl.port == 80 and durl.scheme == 'http':
|
||||
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
|
||||
if durl.port == 443 and durl.scheme == 'https':
|
||||
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
|
||||
if durl.hostname and durl.hostname != durl.netloc.lower():
|
||||
user_pass = ''
|
||||
if durl.username and durl.password:
|
||||
user_pass = f'{durl.username}:{durl.password}@'
|
||||
port = ''
|
||||
if durl.port:
|
||||
port = f':{durl.port}'
|
||||
netloc = f'{user_pass}{durl.hostname.lower()}{port}'
|
||||
durl = durl._replace(netloc=netloc)
|
||||
|
||||
if self._base:
|
||||
# if missing fill in scheme and netloc from base
|
||||
if not durl.scheme:
|
||||
durl = durl._replace(scheme=self._base.scheme)
|
||||
if not durl.netloc:
|
||||
durl = durl._replace(netloc=self._base.netloc)
|
||||
# if match_base, then set res only if the
|
||||
# url is compatible with base url
|
||||
if not self._match_base:
|
||||
res = durl
|
||||
else:
|
||||
if durl.netloc == self._base.netloc:
|
||||
if durl.scheme == self._base.scheme:
|
||||
if self._base.path not in ('/', ''):
|
||||
if durl.path.startswith(self._base.path):
|
||||
res = durl
|
||||
else:
|
||||
res = durl
|
||||
else:
|
||||
res = durl
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f'Durl init failed url={self._url}'
|
||||
f' base={self._base} match_base={self._match_base}'
|
||||
)
|
||||
res = None
|
||||
if res:
|
||||
res = res._replace(fragment='')
|
||||
if not res.hostname or len(res.hostname) > 90:
|
||||
res = None
|
||||
elif res.scheme not in ('https', 'http'):
|
||||
res = None
|
||||
elif ' ' in res.hostname or '.' not in res.hostname:
|
||||
res = None
|
||||
elif not (await get_ips(res.hostname)):
|
||||
res = None
|
||||
elif not res.path.startswith('/'):
|
||||
res = res._replace(path='/')
|
||||
if res:
|
||||
self._durl = res
|
||||
return self
|
||||
self._durl = None
|
||||
|
||||
def __getattr__(self, attr):
|
||||
return getattr(self._durl, attr)
|
||||
|
||||
def url(self) -> str:
|
||||
"""
|
||||
Return the URL as string.
|
||||
"""
|
||||
return self._durl.geturl()
|
||||
|
||||
def pwa(self) -> str:
|
||||
"""
|
||||
Return the (base-relative) path with args of the Durl.
|
||||
"""
|
||||
if self._base and self._match_base:
|
||||
path = self._durl.path.removeprefix(self._base.path)
|
||||
else:
|
||||
path = self._durl.path
|
||||
qs = f'?{self._durl.query}' if self._durl.query else ''
|
||||
return f'{path}{qs}'.lstrip('/')
|
||||
|
||||
def has_path(self) -> bool:
|
||||
"""
|
||||
Return whether the Durl has a non-trivial path.
|
||||
"""
|
||||
return self._durl.path not in ('/', '')
|
||||
|
||||
def site(self) -> str:
|
||||
"""
|
||||
Return the site (base_url).
|
||||
"""
|
||||
return f'{self._durl.scheme}://{self._durl.netloc}/'
|
||||
|
||||
def domain(self) -> str:
|
||||
"""
|
||||
Return the domain of the Durl (wrong in case of second-level domains).
|
||||
"""
|
||||
levels = extract(self._durl.hostname)
|
||||
return '.'.join(levels[-2:]).lower()
|
||||
|
||||
def replace_scheme(self, scheme: str) -> None:
|
||||
"""
|
||||
Replace the scheme (must be 'http' or 'https').
|
||||
"""
|
||||
self._durl = self._durl._replace(scheme=scheme)
|
||||
|
||||
|
||||
@alru_cache(maxsize=1000)
|
||||
async def get_ips(hostname: str) -> set[str]:
|
||||
"""
|
||||
Return IPv4 and IPv6 addresses of the given hostname.
|
||||
"""
|
||||
ips = set()
|
||||
for type_ in (types.A, types.AAAA):
|
||||
try:
|
||||
res, cached = await resolver.query(hostname, type_)
|
||||
if res:
|
||||
if addr := res.get_record([type_]):
|
||||
ips.add(addr.data)
|
||||
except Exception:
|
||||
pass
|
||||
return ips
|
||||
|
||||
|
||||
def get_url_variants(url: str) -> list[str]:
|
||||
"""
|
||||
Return variants of the URL.
|
||||
|
||||
Replace http with https and vice versa;
|
||||
prepend or remove 'www.' to or from the beginning of the hostname.
|
||||
"""
|
||||
if url.startswith('http://www.'):
|
||||
s = url.removeprefix('http://www.')
|
||||
return [url, f'http://{s}', f'https://www.{s}', f'https://{s}']
|
||||
elif url.startswith('http://'):
|
||||
s = url.removeprefix('http://')
|
||||
return [url, f'http://www.{s}', f'https://www.{s}', f'https://{s}']
|
||||
elif url.startswith('https://www.'):
|
||||
s = url.removeprefix('https://www.')
|
||||
return [url, f'https://{s}', f'http://www.{s}', f'http://{s}']
|
||||
elif url.startswith('https://'):
|
||||
s = url.removeprefix('https://')
|
||||
return [url, f'https://www.{s}', f'http://www.{s}', f'http://{s}']
|
||||
else:
|
||||
return [url]
|
||||
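# For example:
#
#     get_url_variants('https://example.org/path')
#     # -> ['https://example.org/path', 'https://www.example.org/path',
#     #     'http://www.example.org/path', 'http://example.org/path']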
|
||||
|
||||
async def assort_links(
|
||||
links: dict[str, tuple[int, int, list[str]]],
|
||||
durl: Durl,
|
||||
text: str,
|
||||
base_url: str = None,
|
||||
) -> tuple[
|
||||
dict[str, tuple[int, int, list[str]]],
|
||||
dict[Durl, tuple[list[str], str]],
|
||||
dict[Durl, tuple[list[str], str]],
|
||||
]:
|
||||
"""
|
||||
Sort links into a cleaned, an internal and an external dict.
|
||||
|
||||
The cleaned dict maps absolute URLs to char ranges and relations.
|
||||
The internal dict maps absolute URLs to relations and the linked text.
|
||||
The external dict maps absolute URLs to relations and the linked text.
|
||||
The relations are link relations, e.g. rel="canonical".
|
||||
|
||||
If *base_url* is set, it is used to distinguish internal and external
|
||||
links. If it is not set, the base_url is obtained from *durl*.
|
||||
"""
|
||||
res_int = {}
|
||||
res_ext = {}
|
||||
if not base_url:
|
||||
base_url = durl.site().lower()
|
||||
base_durl = await Durl(base_url)
|
||||
cleaned_links = {}
|
||||
for href, (i, f, rel) in links.items():
|
||||
durl = await Durl(href, base=base_durl)
|
||||
if not durl:
|
||||
continue
|
||||
if durl.hostname and in_blacklist(durl.hostname):
|
||||
continue
|
||||
cleaned_links[durl.url()] = i, f, rel
|
||||
txt = text[i:f]
|
||||
if durl.site().lower() == base_url:
|
||||
res_int[durl] = rel, txt
|
||||
else:
|
||||
res_ext[durl] = rel, txt
|
||||
return cleaned_links, res_int, res_ext
|
136
src/atextcrawler/utils/html.py
Normal file
|
@ -0,0 +1,136 @@
|
|||
"""
|
||||
Utilities for extracting information from html.
|
||||
"""
|
||||
|
||||
import re
|
||||
from html import unescape
|
||||
from typing import Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .lang import clean_lang
|
||||
from .tag import drop_roles, drop_tags, keep_tags
|
||||
|
||||
re_ = {
|
||||
'html_lang': re.compile(
|
||||
'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
|
||||
),
|
||||
'title': re.compile('<title[^>]*>([^<]*)</title>', re.I | re.S),
|
||||
'strip': re.compile(
|
||||
'<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S
|
||||
),
|
||||
'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
|
||||
'whitespace': re.compile('(\s| )+', re.S),
|
||||
'whitespace_': re.compile('\s| ?'), # allow broken  
|
||||
'whitespace_near_tag': re.compile(
|
||||
'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
|
||||
'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
|
||||
re.S,
|
||||
),
|
||||
'whitespace_tag_tag': re.compile('(\s+)((<[^>]+>\s+)+)', re.S),
|
||||
'whitespace_tag_tag_func': re.compile('(<[^>]+>)\s+', re.S),
|
||||
'http_equiv': re.compile('(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
|
||||
}
|
||||
|
||||
|
||||
def whitespace_tag_tag(match_obj):
|
||||
"""
|
||||
Helper function for removing whitespace between tags.
|
||||
"""
|
||||
return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', match_obj.group(2))
|
||||
|
||||
|
||||
def clean_html(s: Optional[str]) -> Optional[str]:
|
||||
"""
|
||||
Clean an html string.
|
||||
|
||||
Unescape htmlentities and replace whitespaces with ' ' (ASCII char 0x20).
|
||||
|
||||
See also: https://www.lesinskis.com/python-unicode-whitespace.html
|
||||
"""
|
||||
return re_['whitespace_'].sub(' ', unescape(s)).strip() if s else None
|
||||
|
||||
|
||||
def get_html_lang(html: str) -> Optional[str]:
|
||||
"""
|
||||
Return the language, if any, found in the lang attribute of the html tag.
|
||||
"""
|
||||
m = re_['html_lang'].search(html)
|
||||
return clean_lang(m.group(1)) if m else None
|
||||
|
||||
|
||||
def extract_title(html: str) -> Optional[str]:
|
||||
"""
|
||||
Extract title tags from html returning their content as a string.
|
||||
"""
|
||||
if not (titles := re_['title'].findall(html)):
|
||||
return None
|
||||
titles = [clean_html(title) for title in reversed(titles) if title]
|
||||
return ' - '.join(titles).strip(' |')
|
||||
|
||||
|
||||
def clean_page(html):
|
||||
"""
|
||||
Remove unwanted tags including their content from html.
|
||||
|
||||
Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
|
||||
Also drop tags with attribute aria-hidden=true.
|
||||
|
||||
Return a beautiful soup.
|
||||
"""
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
for tag in drop_tags:
|
||||
for n in soup.find_all(tag):
|
||||
n.decompose()
|
||||
for n in soup.find_all(attrs={'aria-hidden': 'true'}):
|
||||
n.decompose()
|
||||
for role in drop_roles:
|
||||
for n in soup.find_all(attrs={'rel': role}):
|
||||
n.decompose()
|
||||
return soup
|
||||
|
||||
|
||||
def clean_body(body):
|
||||
"""
|
||||
Clean an html body.
|
||||
|
||||
Remove unwanted tags (keeping their content); remove empty tags;
|
||||
remove and replace whitespaces in several ways.
|
||||
|
||||
In the end the only whitespace is a space and there are no
|
||||
consecutive spaces.
|
||||
"""
|
||||
body = re_['strip'].sub(' ', body)
|
||||
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
|
||||
body = re_['whitespace'].sub(' ', body)
|
||||
while re_['empty_tag'].search(body):
|
||||
body = re_['empty_tag'].sub(r'\3', body)
|
||||
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
|
||||
body = re_['whitespace'].sub(' ', body)
|
||||
body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
|
||||
return body.strip().replace('\u00ad', '') # soft hyphen
|
||||
|
||||
|
||||
def get_html_redirect(html: str) -> Optional[str]:
|
||||
"""
|
||||
Return an html redirect in an http-equiv meta tag.
|
||||
|
||||
If none is found, return None.
|
||||
"""
|
||||
redir_url = None
|
||||
http_equivs = re_['http_equiv'].findall(html)
|
||||
for raw in http_equivs:
|
||||
tag = BeautifulSoup(raw, 'html.parser').meta
|
||||
if tag and tag.get('http-equiv', '').lower() == 'refresh':
|
||||
if content := tag.get('content'):
|
||||
try:
|
||||
_, redir_url = content.split(';')
|
||||
redir_url = (
|
||||
redir_url.strip()
|
||||
.removeprefix('url=')
|
||||
.removeprefix('URL=')
|
||||
.strip("'")
|
||||
)
|
||||
except ValueError:
|
||||
pass
|
||||
return redir_url
|
58
src/atextcrawler/utils/http.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
"""
|
||||
Utility functions related to http.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from multidict import CIMultiDictProxy
|
||||
|
||||
from ..models import Site
|
||||
from .durl import Durl
|
||||
|
||||
re_ = {
|
||||
'link_header': re.compile(',\s*(?=<)'),
|
||||
'rel_canonical': re.compile(';\s*rel\s*=\s*["\']?canonical', re.I),
|
||||
'rel_shortlink': re.compile(';\s*rel\s*=\s*["\']?shortlink', re.I),
|
||||
}
|
||||
|
||||
|
||||
async def get_header_links(
|
||||
headers: CIMultiDictProxy,
|
||||
durl: Durl,
|
||||
site: Optional[Site],
|
||||
) -> dict[str, Optional[str]]:
|
||||
"""
|
||||
Extract canonical and shortlink links from http headers.
|
||||
|
||||
*durl* must be the Durl of the fetched page and *site* - if not None -
|
||||
must be the Site to which the page belongs.
|
||||
|
||||
Return a dict with 'canonical' and 'shortlink' as keys.
|
||||
The values default to None.
|
||||
"""
|
||||
res = {}
|
||||
canonical = shortlink = None
|
||||
if 'link' in headers and (link_headers := headers.getall('link')):
|
||||
links = []
|
||||
for link_header in link_headers:
|
||||
links += re_['link_header'].split(link_header)
|
||||
url = durl.url()
|
||||
base_url = site.base_url if site else url
|
||||
base_durl = await Durl(base_url) if base_url else None
|
||||
for link in links:
|
||||
if not canonical and 'canonical' in link.lower():
|
||||
if re_['rel_canonical'].search(link):
|
||||
canon_url = link.strip().lstrip('<').split('>')[0]
|
||||
if canon_durl := await Durl(canon_url, base=base_durl):
|
||||
canonical = canon_durl.url()
|
||||
if not shortlink and 'shortlink' in link.lower():
|
||||
if re_['rel_shortlink'].search(link):
|
||||
short_url = link.strip().lstrip('<').split('>')[0]
|
||||
if short_durl := await Durl(short_url, base=base_durl):
|
||||
shortlink = short_durl.url()
|
||||
if canonical and shortlink:
|
||||
break
|
||||
res['canonical'] = canonical
|
||||
res['shortlink'] = shortlink
|
||||
return res
|
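# Illustrative input/output (URLs invented): a response header like
#     Link: <https://example.com/page/>; rel="canonical",
#           <https://example.com/?p=1>; rel="shortlink"
# yields {'canonical': 'https://example.com/page/',
#         'shortlink': 'https://example.com/?p=1'}.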
32
src/atextcrawler/utils/json.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
"""
|
||||
Custom JSON encoder.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
|
||||
class JSONEncoderExt(json.JSONEncoder):
|
||||
"""
|
||||
Extended JSON encoder with encoding of sets as lists.
|
||||
"""
|
||||
|
||||
def default(self, obj):
|
||||
"""
|
||||
Encode sets as lists and everything else as by default.
|
||||
"""
|
||||
if isinstance(obj, set):
|
||||
return list(obj)
|
||||
return json.JSONEncoder.default(self, obj)
|
||||
|
||||
|
||||
def json_dumps(obj):
|
||||
"""
|
||||
Encode an object to a JSON string using JSONEncoderExt.
|
||||
"""
|
||||
return json.dumps(obj, cls=JSONEncoderExt)
|
||||
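# For example, sets are serialized as lists:
#
#     json_dumps({'ids': {1}})  # -> '{"ids": [1]}'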
|
||||
|
||||
json_loads = json.loads
|
||||
"""
|
||||
Decoding of JSON strings as by default.
|
||||
"""
|
44
src/atextcrawler/utils/lang.py
Normal file
|
@ -0,0 +1,44 @@
|
|||
"""
|
||||
Utility functions related to languages.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import gcld3
|
||||
|
||||
asset_path = Path(__file__).parent.parent / 'assets'
|
||||
|
||||
|
||||
with open(asset_path / 'iso_639-1', 'r') as f:
|
||||
iso_639_1_codes = f.read().strip().split('\n')
|
||||
|
||||
|
||||
lang_detector = gcld3.NNetLanguageIdentifier(
|
||||
min_num_bytes=0, max_num_bytes=1000
|
||||
)
|
||||
|
||||
|
||||
def clean_lang(lang: Optional[str]) -> Optional[str]:
|
||||
"""
|
||||
Clean a language code string: it must be an ISO 639-1 code or None.
|
||||
"""
|
||||
if lang is None:
|
||||
return None
|
||||
lang = lang[:2].lower()
|
||||
if lang in iso_639_1_codes:
|
||||
return lang
|
||||
return None
|
||||
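# For example (assuming the iso_639-1 asset lists the standard codes):
#
#     clean_lang('en-US')  # -> 'en'
#     clean_lang('DE')     # -> 'de'
#     clean_lang('--')     # -> None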
|
||||
|
||||
def extract_content_language(text: str) -> Optional[str]:
|
||||
"""
|
||||
Extract the language from a text.
|
||||
"""
|
||||
if len(text) < 10:
|
||||
return None
|
||||
lang = None
|
||||
lang_det = lang_detector.FindLanguage(text=text)
|
||||
if lang_det.is_reliable:
|
||||
lang = lang_det.language[:2]
|
||||
return lang
|
116
src/atextcrawler/utils/link.py
Normal file
|
@ -0,0 +1,116 @@
|
|||
"""
|
||||
Hyperlinks (a href, link).
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import tldextract
|
||||
|
||||
nofollow_link_rels = set(
|
||||
[
|
||||
'nofollow',
|
||||
'search',
|
||||
'noreferrer',
|
||||
'noopener',
|
||||
'help',
|
||||
'license',
|
||||
]
|
||||
)
|
||||
"""
|
||||
Do not follow the hrefs in anchor tags with these values of the rel attribute.
|
||||
"""
|
||||
|
||||
|
||||
meta_names = (
|
||||
'generator',
|
||||
'lang',
|
||||
'language',
|
||||
'description',
|
||||
'keywords',
|
||||
'author',
|
||||
'title',
|
||||
'subject',
|
||||
'revised',
|
||||
'abstract',
|
||||
'topic',
|
||||
'summary',
|
||||
'classification',
|
||||
'category',
|
||||
'reply-to',
|
||||
'owner',
|
||||
'url',
|
||||
'identifier-URL',
|
||||
'geo.position',
|
||||
'geo.region',
|
||||
'geo.placename',
|
||||
'dc.language',
|
||||
)
|
||||
"""
|
||||
Values of the name attribute of meta tags to keep.
|
||||
|
||||
See also: https://gist.github.com/lancejpollard/1978404
|
||||
See also: https://github.com/joshbuchea/HEAD
|
||||
"""
|
||||
|
||||
|
||||
meta_props = (
|
||||
'og:site_name',
|
||||
'og:locale',
|
||||
'og:type',
|
||||
'og:latitude',
|
||||
'og:longitude',
|
||||
'og:street',
|
||||
'og:locality',
|
||||
'og:region',
|
||||
'og:postal',
|
||||
'og:country',
|
||||
)
|
||||
"""
|
||||
Values of the property attribute of meta tags to keep.
|
||||
"""
|
||||
|
||||
|
||||
link_rels = set(
|
||||
[
|
||||
'webmention',
|
||||
'pingback',
|
||||
'alternate',
|
||||
'canonical',
|
||||
'author',
|
||||
]
|
||||
)
|
||||
"""
|
||||
Values of the rel attribute of link tags to keep.
|
||||
"""
|
||||
|
||||
|
||||
def load_blacklist():
|
||||
"""
|
||||
Return the 10000 most popular internet domains.
|
||||
"""
|
||||
path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
|
||||
with open(path, 'r') as file:
|
||||
domains = file.read().strip().splitlines()
|
||||
return domains
|
||||
|
||||
|
||||
domain_blacklist = load_blacklist()
|
||||
|
||||
|
||||
def in_blacklist(hostname: str) -> Optional[str]:
|
||||
"""
|
||||
Return a match of host in the blacklist, or None.
|
||||
"""
|
||||
domain = extract_domain(hostname)
|
||||
if domain in domain_blacklist:
|
||||
return hostname
|
||||
return None
|
||||
|
||||
|
||||
def extract_domain(hostname: str) -> str:
|
||||
"""
|
||||
Extract the lower-case domain from a hostname.
|
||||
"""
|
||||
levels = tldextract.extract(hostname)
|
||||
return '.'.join(levels[-2:]).lower()
|
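# For example (with the pinned tldextract, which knows multi-part
# public suffixes):
#
#     extract_domain('Blog.Example.co.uk')  # -> 'example.co.uk'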
120
src/atextcrawler/utils/muse.py
Normal file
|
@ -0,0 +1,120 @@
|
|||
"""
|
||||
Parse muse-formatted plaintext (delivered by amusewiki).
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from .date_finder import extract_latest_date
|
||||
from .lang import clean_lang
|
||||
|
||||
re_tag = re.compile(r'<[^<]+?>')
|
||||
|
||||
|
||||
def parse_muse(text: str) -> Optional[tuple[dict, str]]:
|
||||
"""
|
||||
Parse a MUSE string returning meta information and the text body.
|
||||
"""
|
||||
head, body = split_head_body(text)
|
||||
if not head:
|
||||
return None
|
||||
meta = parse_head(head)
|
||||
if not meta:
|
||||
return None
|
||||
return extract_muse_meta(meta, body), body
|
||||
|
||||
|
||||
def split_head_body(text: str) -> tuple[str, str]:
|
||||
"""
|
||||
Split a MUSE string into head and body and return both.
|
||||
"""
|
||||
head = ''
|
||||
while text.startswith('#'):
|
||||
line_end = text.find('\n') + 1
|
||||
head += text[:line_end]
|
||||
text = text[line_end:]
|
||||
return head.strip(), text.strip()
|
||||
|
||||
|
||||
def parse_head(text: str) -> dict:
|
||||
"""
|
||||
Parse a MUSE head and return a dict mapping field names to values.
|
||||
"""
|
||||
fields = {}
|
||||
for line in text.split('\n'):
|
||||
name, value = line.strip().split(' ', 1)
|
||||
fields[name[1:]] = value
|
||||
return fields
|
||||
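# A MUSE head is a block of '#field value' lines (names invented):
#
#     parse_head('#title An Example\n#author Jane Doe\n#lang en')
#     # -> {'title': 'An Example', 'author': 'Jane Doe', 'lang': 'en'}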
|
||||
|
||||
amusewiki_fields = [
|
||||
'author',
|
||||
'title',
|
||||
'lang',
|
||||
'LISTtitle', # reduced title for alphabetical sorting
|
||||
'subtitle',
|
||||
'SORTauthors', # authors separated by ';' or ',' (only for indexing)
|
||||
'SORTtopics', # topics separated by ';' or ',' (only for indexing)
|
||||
'date', # publication year
|
||||
'pubdate', # publication datetime
|
||||
'notes', # additional info (orig title, translators, credits, ...)
|
||||
'source', # preferred format: "Retrieved on March 8, 2012 from {URL}"
|
||||
'publisher',
|
||||
'isbn',
|
||||
#'rights',
|
||||
'seriesname',
|
||||
'seriesnumber',
|
||||
#'hyphenation', # irrelevant
|
||||
#'slides', # irrelevant
|
||||
#'DELETED', # irrelevant
|
||||
#'cover', # irrelevant
|
||||
#'coverwidth', # irrelevant
|
||||
#'nocoverpage', # irrelevant
|
||||
#'notoc', # irrelevant
|
||||
#'nofinalpage', # irrelevant
|
||||
#'impressum', # irrelevant
|
||||
#'continuefootnotes', # irrelevant
|
||||
#'centerchapter', # irrelevant
|
||||
#'centersection', # irrelevant
|
||||
]
|
||||
"""
|
||||
Relevant amusewiki header fields (cf. https://amusewiki.org/library/manual).
|
||||
"""
|
||||
|
||||
|
||||
re_list = re.compile('[;,]')
|
||||
|
||||
|
||||
def extract_muse_meta(meta, body) -> dict:
|
||||
"""
|
||||
Extract meta information from muse header and muse body.
|
||||
"""
|
||||
authors = set()
|
||||
if author := meta.get('author', '').strip():
|
||||
authors.add(author)
|
||||
if sortauthors := meta.get('SORTauthors', '').strip():
|
||||
for author in re_list.split(sortauthors):
|
||||
if author_ := author.strip():
|
||||
authors.add(author_)
|
||||
pubdate = meta.get('pubdate', '').strip()
|
||||
pub_date: Optional[datetime] = None
|
||||
if pubdate:
|
||||
try:
|
||||
pub_date = datetime.fromisoformat(pubdate)
|
||||
except ValueError:
|
||||
pub_date = extract_latest_date(pubdate)
|
||||
summary = re_tag.sub('', body[:1000].split('\n\n')[0])
|
||||
return {
|
||||
'title': re_tag.sub('', meta.get('title', '')) or None,
|
||||
'authors': authors,
|
||||
'lang': clean_lang(meta.get('lang')),
|
||||
'keywords': [
|
||||
s.strip()
|
||||
for s in re_list.split(meta.get('SORTtopics', '').strip())
|
||||
if s.strip()
|
||||
],
|
||||
'pub_date': pub_date,
|
||||
'summary': summary,
|
||||
'orig_source': meta.get('source', '').strip() or None,
|
||||
}
|
22
src/atextcrawler/utils/probe.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
Utility functions for probing / sampling.
|
||||
"""
|
||||
|
||||
|
||||
def extract_samples(items, n=5):
|
||||
"""
|
||||
Extract up to n sample elements from the given dict or list.
|
||||
|
||||
If *items* is a dict return the elements from the list of keys.
|
||||
"""
|
||||
l = len(items)
|
||||
if l <= n:
|
||||
return items
|
||||
poss = []
|
||||
step = (l + 1) / n
|
||||
for i in range(n):
|
||||
pos = int(step * i)
|
||||
if pos < l and (not poss or pos > poss[-1]):
|
||||
poss.append(pos)
|
||||
items_list = list(items)
|
||||
return [items_list[pos] for pos in poss]
|
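# For example:
#
#     extract_samples(list(range(10)), n=5)   # -> [0, 2, 4, 6, 8]
#     extract_samples({'a': 1, 'b': 2}, n=5)  # -> {'a': 1, 'b': 2}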
74
src/atextcrawler/utils/section.py
Normal file
|
@ -0,0 +1,74 @@
|
|||
"""
|
||||
Operations on text sections.
|
||||
|
||||
Semantic breaks are character positions within a text (0-offset)
|
||||
where a new section begins. More precisely, the character position
|
||||
contains a space and only at the next position begins a tag that is
|
||||
semantically breaking (e.g., an h1 or a br).
|
||||
|
||||
Each semantic break has a level, which means breaking strength.
|
||||
The lower the level (e.g., h1 has a lower level than h2), the
|
||||
stronger the break.
|
||||
|
||||
Implicitly, if position 0 has no semantic break, a semantic break
|
||||
at position 0 with level 80 is added.
|
||||
|
||||
Semantic breaks can be used to split a text into sections.
|
||||
The lower the maximum level of the semantic breaks taken into account,
|
||||
the coarser the segmentation and the fewer the sections.
|
||||
Each section is given the level of the semantic break at its beginning.
|
||||
|
||||
From another point of view, sections have levels indicating
|
||||
the segmentation depth.
|
||||
|
||||
The levels for html tags are defined in tag.py.
|
||||
|
||||
The *semantic_breaks* argument in the functions below
|
||||
is a dict mapping the character position of the semantic break
|
||||
to the level of a section beginning at this position
|
||||
(if segmentation is done at this or a higher level).
|
||||
"""
|
||||
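# A minimal illustration: the semantic_breaks dict
#
#     {0: 30, 120: 60, 480: 32}
#
# describes a text with an h1-level break at position 0, a paragraph-level
# break at position 120 and an h2-level break at position 480
# (levels as defined in tag.py).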
|
||||
|
||||
def iter_sections(text, semantic_breaks, max_level=59):
|
||||
"""
|
||||
Iterate over sections, limiting to those with a maximum level.
|
||||
|
||||
Yield (start_pos, end_pos, level, text).
|
||||
*text* is assumed to have the first semantic break at position 0.
|
||||
"""
|
||||
n = len(text)
|
||||
last_pos = 0
|
||||
last_level = semantic_breaks.get(0, 80)
|
||||
for pos, level in sorted(semantic_breaks.items()):
|
||||
if level <= max_level and last_pos != pos:
|
||||
yield last_pos, pos, last_level, text[last_pos + 1 : pos]
|
||||
last_pos = pos
|
||||
last_level = level
|
||||
if last_pos < n:
|
||||
yield last_pos, n, last_level, text[last_pos:]
|
||||
|
||||
|
||||
def concat_section_texts(text, semantic_breaks, min_len=2000):
|
||||
"""
|
||||
Try to concat consecutive sections into chunks with a minimum length.
|
||||
|
||||
Yield (section_ids, combined_text).
|
||||
"""
|
||||
n = len(text)
|
||||
last_pos = 0
|
||||
section_ids = []
|
||||
for section_id, pos in enumerate(semantic_breaks.keys()):
|
||||
if pos >= last_pos + min_len:
|
||||
if n - pos < min_len:
|
||||
for id_ in [
|
||||
i for i, k in enumerate(semantic_breaks.keys()) if k >= pos
|
||||
]:
|
||||
section_ids.append(id_)
|
||||
pos = n
|
||||
yield section_ids, text[last_pos:pos]
|
||||
last_pos = pos
|
||||
section_ids = []
|
||||
section_ids.append(section_id)
|
||||
if last_pos < n:
|
||||
yield section_ids, text[last_pos:]
|
92
src/atextcrawler/utils/similarity.py
Normal file
|
@ -0,0 +1,92 @@
|
|||
"""
|
||||
Text similarity with simhash.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from asyncpg import Connection
|
||||
from simhash import Simhash, SimhashIndex
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.ERROR)
|
||||
|
||||
|
||||
postgresql_bigint_offset = 9223372036854775808
|
||||
"""
|
||||
Subtract this number to get a PostgreSQL bigint from a 64bit int.
|
||||
"""
|
||||
|
||||
|
||||
def get_features(txt: str) -> list[str]:
|
||||
"""
|
||||
Extract features from string for use with Simhash.
|
||||
"""
|
||||
width = 3
|
||||
txt = txt.replace(' ', '').lower()
|
||||
return [txt[i : i + width] for i in range(max(len(txt) - width + 1, 1))]
|
||||
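# For example (3-grams of the lower-cased text with spaces removed):
#
#     get_features('Hello')  # -> ['hel', 'ell', 'llo']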
|
||||
|
||||
def simhash_to_bigint(simhash: Simhash) -> int:
|
||||
"""
|
||||
Convert a simhash to PostgreSQL's bigint value range.
|
||||
"""
|
||||
return simhash.value - postgresql_bigint_offset
|
||||
|
||||
|
||||
def simhash_from_bigint(bigint: int) -> Simhash:
|
||||
"""
|
||||
Convert a simhash from PostgreSQL's bigint to a Simhash instance.
|
||||
"""
|
||||
return Simhash(bigint + postgresql_bigint_offset, log=logger)
|
||||
|
||||
|
||||
def get_simhash(text: str) -> Simhash:
|
||||
"""
|
||||
Return the Simhash of the given text.
|
||||
"""
|
||||
return Simhash(get_features(text), log=logger)
|
||||
|
||||
|
||||
async def get_simhash_index(conn: Connection, site_id: int) -> SimhashIndex:
|
||||
"""
|
||||
Return a simhash index with hashes of all stored resources of the site.
|
||||
"""
|
||||
sql = (
|
||||
"SELECT r.id, r.simhash FROM site_path sp, resource r"
|
||||
" WHERE sp.site_id=$1 AND sp.resource_id=r.id"
|
||||
)
|
||||
rows = await conn.fetch(sql, site_id)
|
||||
objs = [
|
||||
(
|
||||
str(row['id']),
|
||||
Simhash(row['simhash'] + postgresql_bigint_offset, log=logger),
|
||||
)
|
||||
for row in rows
|
||||
]
|
||||
return SimhashIndex(objs, k=3, log=logger)
|
||||
|
||||
|
||||
def create_simhash(
|
||||
index: SimhashIndex,
|
||||
resource_id: int,
|
||||
simhash_instance: Simhash,
|
||||
) -> int:
|
||||
"""
|
||||
Add a resource with given id and simhash to a simhash index.
|
||||
|
||||
Return the simhash value shifted into PostgreSQL's bigint range.
|
||||
|
||||
(The simhash field of the resource's database entry is not updated.)
|
||||
"""
|
||||
index.add(str(resource_id), simhash_instance)
|
||||
return simhash_to_bigint(simhash_instance)
|
||||
|
||||
|
||||
def search_simhash(index: SimhashIndex, simhash_inst: Simhash) -> list[int]:
|
||||
"""
|
||||
Return the ids of similar resources from the index.
|
||||
"""
|
||||
found = index.get_near_dups(simhash_inst)
|
||||
if found:
|
||||
return sorted([int(elem) for elem in found])
|
||||
return []
|
189
src/atextcrawler/utils/tag.py
Normal file
|
@ -0,0 +1,189 @@
|
|||
"""
|
||||
Information collections related to html tags.
|
||||
"""
|
||||
|
||||
|
||||
drop_tags = [
|
||||
'applet',
|
||||
'area',
|
||||
'audio',
|
||||
'base',
|
||||
'basefont',
|
||||
'bdi',
|
||||
'bdo',
|
||||
'button',
|
||||
'canvas',
|
||||
'code',
|
||||
'command',
|
||||
'data',
|
||||
'datalist',
|
||||
'dir',
|
||||
'embed',
|
||||
'fieldset',
|
||||
'figure',
|
||||
'form',
|
||||
'frame',
|
||||
'frameset',
|
||||
'iframe',
|
||||
'img',
|
||||
'input',
|
||||
'label',
|
||||
'legend',
|
||||
'map',
|
||||
'menuitem',
|
||||
'meter',
|
||||
'noframes',
|
||||
'noscript',
|
||||
'object',
|
||||
'optgroup',
|
||||
'option',
|
||||
'param',
|
||||
'picture',
|
||||
'progress',
|
||||
'rp',
|
||||
'rt',
|
||||
'ruby',
|
||||
'samp',
|
||||
'script',
|
||||
'select',
|
||||
'source',
|
||||
'style',
|
||||
'svg',
|
||||
'template',
|
||||
'textarea',
|
||||
'track',
|
||||
'var',
|
||||
'video',
|
||||
]
|
||||
"""
|
||||
Tags to drop, including their content.
|
||||
"""
|
||||
|
||||
|
||||
keep_tags = {
|
||||
'a': (0, 0, ''),
|
||||
'abbr': (0, 0, 'st'),
|
||||
'acronym': (0, 0, 'st'),
|
||||
'address': (1, 0, 'm'),
|
||||
'article': (1, 15, ''),
|
||||
'aside': (1, 0, 'd'),
|
||||
'b': (0, 0, 'st'),
|
||||
'blockquote': (1, 65, 'q'),
|
||||
'br': (1, 80, ''),
|
||||
'caption': (1, 68, ''),
|
||||
'center': (1, 50, ''),
|
||||
'cite': (1, 0, 'd'),
|
||||
'col': (1, 75, ''),
|
||||
'colgroup': (1, 73, ''),
|
||||
'dd': (1, 70, 'li'),
|
||||
'del': (0, 0, 'se'),
|
||||
'details': (1, 0, 'd'),
|
||||
'dfn': (0, 0, 'st'),
|
||||
'div': (1, 60, ''), # lvl often revised to min of contained tags
|
||||
'dl': (1, 70, 'l'),
|
||||
'dt': (1, 70, 'li'),
|
||||
'em': (0, 0, 'st'),
|
||||
'figcaption': (1, 0, ''),
|
||||
'font': (0, 0, 's'),
|
||||
'footer': (1, 15, ''),
|
||||
'h1': (1, 30, ''),
|
||||
'h2': (1, 32, ''),
|
||||
'h3': (1, 34, ''),
|
||||
'h4': (1, 36, ''),
|
||||
'h5': (1, 38, ''),
|
||||
'h6': (1, 40, ''),
|
||||
'header': (1, 15, ''),
|
||||
'hr': (1, 30, ''),
|
||||
'i': (0, 0, 'st'),
|
||||
'ins': (0, 0, 'se'),
|
||||
'li': (1, 75, 'li'), # lvl revised if not inside p
|
||||
'main': (1, 10, ''),
|
||||
'mark': (0, 0, 's'),
|
||||
'nav': (1, 0, ''), # keep for footnotes
|
||||
'ol': (1, 70, 'l'), # lvl revised if not inside p
|
||||
'p': (1, 60, ''),
|
||||
'pre': (1, 65, 'q'),
|
||||
'q': (1, 0, 'q'),
|
||||
's': (0, 0, ''),
|
||||
'section': (1, 24, ''),
|
||||
'small': (0, 0, 'd'),
|
||||
'span': (0, 0, 's'),
|
||||
'strike': (0, 0, 'se'),
|
||||
'strong': (0, 0, 'st'),
|
||||
'sub': (0, 0, ''),
|
||||
'summary': (1, 20, 'm'),
|
||||
'sup': (0, 0, ''),
|
||||
'table': (1, 65, ''),
|
||||
'tbody': (1, 70, ''),
|
||||
'td': (1, 78, ''),
|
||||
'tfoot': (1, 70, ''),
|
||||
'th': (1, 75, ''),
|
||||
'thead': (1, 70, ''),
|
||||
'time': (0, 0, 'm'),
|
||||
'tr': (1, 75, ''),
|
||||
'u': (0, 0, 's'),
|
||||
'ul': (1, 70, 'l'), # lvl revised if not inside p
|
||||
}
|
||||
"""
|
||||
Tags to keep for annotation, and their properties.
|
||||
|
||||
The properties are:
|
||||
|
||||
* sep: whether to separate text at both sides of the tag with a space
|
||||
* lvl: structural depth level of content of this tag;
|
||||
the paragraph level is 60; headings are below 60, listings above;
|
||||
a div below the tag will usually have the tag's depth + 1
|
||||
* sem: semantic categories: zero or more of
|
||||
* s=span
|
||||
* l=listing
|
||||
* i=list_item
|
||||
* t=term
|
||||
* e=edit
|
||||
* d=details
|
||||
* q=quote
|
||||
* m=meta
|
||||
* x=exclude
|
||||
"""
|
||||
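# Reading an entry: keep_tags['blockquote'] == (1, 65, 'q') means the
# content is separated by a space (sep=1), sits at structural level 65
# and carries the semantic category 'q' (quote).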
|
||||
|
||||
self_closing_tags = ('br', 'hr')
|
||||
"""
|
||||
Those among keep_tags which are self-closing.
|
||||
"""
|
||||
|
||||
|
||||
all_self_closing_tags = (
|
||||
'area',
|
||||
'base',
|
||||
'br',
|
||||
'col',
|
||||
'embed',
|
||||
'hr',
|
||||
'img',
|
||||
'input',
|
||||
'link',
|
||||
'meta',
|
||||
'param',
|
||||
'source',
|
||||
'track',
|
||||
'wbr',
|
||||
)
|
||||
"""
|
||||
All self-closing tags of the html standard.
|
||||
"""
|
||||
|
||||
|
||||
drop_roles = (
|
||||
'banner',
|
||||
'complementary',
|
||||
'contentinfo',
|
||||
'dialog',
|
||||
'figure',
|
||||
'form',
|
||||
'img',
|
||||
'search',
|
||||
'switch',
|
||||
)
|
||||
"""
|
||||
Drop tags with these aria roles.
|
||||
"""
|
7
tests/__init__.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
from .annotation import AnnotateTest
|
||||
from .date_finder import DateFinderTest
|
||||
from .page import PageCleanTest
|
||||
from .section import IterSectionTest, AggSectionTest
|
||||
from .simhash import SimhashTest
|
||||
from .text import CleanHtmlTest
|
||||
from .durl import DurlTest
|
49
tests/annotation.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
"""
|
||||
Test cases for html annotation.
|
||||
"""
|
||||
|
||||
from unittest import TestCase
|
||||
|
||||
from atextcrawler.utils.annotation import annotate
|
||||
|
||||
|
||||
class AnnotateTest(TestCase):
|
||||
"""
|
||||
Test annotation.
|
||||
|
||||
Consider that the <br> and <hr> tags are self-closing.
|
||||
"""
|
||||
|
||||
def test_annotate_1(self):
|
||||
s = '<em>Hello</em><br><strong>world</strong>'
|
||||
text, anns = annotate(s)
|
||||
self.assertEqual(text, ' Hello world')
|
||||
self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
|
||||
self.assertEqual(anns['section_ids'], {})
|
||||
|
||||
def test_annotate_2(self):
|
||||
s = '<em> Hello </em><br><strong> world </strong>'
|
||||
text, anns = annotate(s)
|
||||
self.assertEqual(text, ' Hello world')
|
||||
self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
|
||||
self.assertEqual(anns['section_ids'], {})
|
||||
|
||||
def test_annotate_3(self):
|
||||
s = '<p> Hello <em>world</em> </p> '
|
||||
text, anns = annotate(s)
|
||||
self.assertEqual(text, ' Hello world')
|
||||
self.assertEqual(anns['semantic_breaks'], {0: 60})
|
||||
|
||||
def test_annotate_4(self):
|
||||
s = '<div id = "ref1"><p>Hello <em>world</em> </p> </div>'
|
||||
text, anns = annotate(s)
|
||||
self.assertEqual(text, ' Hello world')
|
||||
self.assertEqual(anns['semantic_breaks'], {0: 60})
|
||||
self.assertEqual(anns['section_ids'], {0: ['ref1']})
|
||||
|
||||
def test_annotate_5(self):
|
||||
s = '<br id="ref2"> Hello <p>world </p> '
|
||||
text, anns = annotate(s)
|
||||
self.assertEqual(text, ' Hello world')
|
||||
self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 60})
|
||||
self.assertEqual(anns['section_ids'], {1: ['ref2']})
|
20
tests/date_finder.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
from datetime import datetime
|
||||
from unittest import TestCase
|
||||
|
||||
from atextcrawler.utils.date_finder import extract_latest_date
|
||||
|
||||
|
||||
class DateFinderTest(TestCase):
|
||||
def test_extract_latest_date(self):
|
||||
s = 'test 1987-2+1-no'
|
||||
r = datetime(1987, 2, 1)
|
||||
self.assertEqual(extract_latest_date(s), r)
|
||||
s = '2020-04-06, whatever and 1987-2-1, 1/20/2021'
|
||||
r = datetime(2020, 4, 6)
|
||||
self.assertEqual(extract_latest_date(s, lang='de'), r)
|
||||
s = 'test 2022-04-06, whatever and 1987-2-1, 1/20/2021'
|
||||
r = datetime(2021, 1, 20)
|
||||
self.assertEqual(extract_latest_date(s, lang='en'), r)
|
||||
s = ''
|
||||
r = None
|
||||
self.assertEqual(extract_latest_date(s), r)
|
68
tests/durl.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
from unittest import IsolatedAsyncioTestCase
|
||||
import asyncpg
|
||||
from atextcrawler.utils.durl import Durl
|
||||
from atextcrawler.config import Config
|
||||
from atextcrawler.db import PGPool
|
||||
|
||||
|
||||
class DurlTest(IsolatedAsyncioTestCase):
|
||||
async def asyncSetUp(self):
|
||||
config = Config().get()
|
||||
self.pool = PGPool(config['postgresql'])
|
||||
await self.pool.__aenter__()
|
||||
self.conn = await self.pool.pool.acquire()
|
||||
|
||||
async def test_durl_basic(self):
|
||||
durl1 = await Durl('https://U:Pw@www.EXAMPLE.com:8000/hello?world#a')
|
||||
self.assertEqual(durl1.scheme, 'https')
|
||||
self.assertEqual(durl1.netloc, 'U:Pw@www.example.com:8000')
|
||||
self.assertEqual(durl1.port, 8000)
|
||||
self.assertEqual(durl1.path, '/hello')
|
||||
self.assertEqual(durl1.fragment, '')
|
||||
self.assertEqual(durl1.pwa(), 'hello?world')
|
||||
self.assertEqual(durl1.site(), 'https://U:Pw@www.example.com:8000/')
|
||||
self.assertEqual(
|
||||
durl1.url(), 'https://U:Pw@www.example.com:8000/' 'hello?world'
|
||||
)
|
||||
self.assertEqual(durl1.has_path(), True)
|
||||
durl2 = await Durl('http://www.example.com/')
|
||||
self.assertEqual(durl2.has_path(), False)
|
||||
durl3 = await Durl('ftp://www.example.com/')
|
||||
self.assertEqual(durl3, None)
|
||||
|
||||
async def test_durl_with_base(self):
|
||||
durl1 = await Durl('https://www.example.com')
|
||||
self.assertEqual(durl1.path, '/')
|
||||
self.assertEqual(durl1.pwa(), '')
|
||||
self.assertEqual(durl1.has_path(), False)
|
||||
durl2 = await Durl('https://www.example.com/hello2', base=durl1)
|
||||
self.assertEqual(durl2.hostname, 'www.example.com')
|
||||
self.assertEqual(durl2.path, '/hello2')
|
||||
self.assertEqual(durl2.pwa(), 'hello2')
|
||||
durl3 = await Durl('/hello3?x=1', base=durl1)
|
||||
self.assertEqual(durl3.hostname, 'www.example.com')
|
||||
self.assertEqual(durl3.path, '/hello3')
|
||||
self.assertEqual(durl3.pwa(), 'hello3?x=1')
|
||||
self.assertEqual(durl3.site(), 'https://www.example.com/')
|
||||
durl4 = await Durl('https://www.kernel.org/', base=durl1)
|
||||
self.assertEqual(durl4, None)
|
||||
|
||||
async def test_durl_with_base_and_match_base(self):
|
||||
durl1 = await Durl('https://www.example.com/base/path/')
|
||||
self.assertEqual(durl1.path, '/base/path/')
|
||||
self.assertEqual(durl1.pwa(), 'base/path/')
|
||||
self.assertEqual(durl1.has_path(), True)
|
||||
durl2 = await Durl(
|
||||
'https://www.example.com/base/', base=durl1, match_base=True
|
||||
)
|
||||
self.assertEqual(durl2, None)
|
||||
durl3 = await Durl(
|
||||
'https://www.example.com/base/path/whatever?x=1#a',
|
||||
base=durl1,
|
||||
match_base=True,
|
||||
)
|
||||
self.assertEqual(durl3.pwa(), 'whatever?x=1')
|
||||
|
||||
async def asyncTearDown(self):
|
||||
await self.pool.pool.release(self.conn)
|
||||
await self.pool.pool.close()
|
24
tests/page.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
"""
|
||||
Test cases for resource type page.
|
||||
"""
|
||||
|
||||
from unittest import TestCase
|
||||
from atextcrawler.utils.html import clean_body
|
||||
|
||||
# from atextcrawler.utils.tag import drop_tags
|
||||
|
||||
|
||||
class PageCleanTest(TestCase):
|
||||
def test_clean_body_1(self):
|
||||
s = ' <em>Hello</em> <strong>world</strong> '
|
||||
r = '<em>Hello</em> <strong>world</strong>'
|
||||
self.assertEqual(clean_body(s), r)
|
||||
|
||||
|
||||
# def test_drop_tags(self):
|
||||
# s = '<figure what="ever">something<figure>else</figure>...</figure>'
|
||||
# r = drop_tags(s)
|
||||
# self.assertEqual(r, '')
|
||||
# s = '<rt><rt><rt><rt>something</rt></rt></rt></rt>'
|
||||
# r = drop_tags(s)
|
||||
# self.assertEqual(r, '')
|
105
tests/section.py
Normal file
|
@ -0,0 +1,105 @@
|
|||
from unittest import TestCase
|
||||
|
||||
from atextcrawler.utils.section import concat_section_texts, iter_sections
|
||||
|
||||
|
||||
class IterSectionTest(TestCase):
|
||||
def test_iter_sections_1(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {0: 80, 5: 2, 15: 1, 20: 3}
|
||||
sections1 = list(iter_sections(s, sb, max_level=100))
|
||||
sections2 = [
|
||||
(0, 5, 80, 'bcde'),
|
||||
(5, 15, 2, 'ghijklmno'),
|
||||
(15, 20, 1, 'qrst'),
|
||||
(20, 26, 3, 'uvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
def test_iter_sections_2(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {0: 4, 5: 2, 15: 1, 20: 3, 26: 9}
|
||||
sections1 = list(iter_sections(s, sb, max_level=100))
|
||||
sections2 = [
|
||||
(0, 5, 4, 'bcde'),
|
||||
(5, 15, 2, 'ghijklmno'),
|
||||
(15, 20, 1, 'qrst'),
|
||||
(20, 26, 3, 'vwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
def test_iter_sections_3(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {5: 2, 15: 60, 18: 50, 20: 3}
|
||||
sections1 = list(iter_sections(s, sb, max_level=59))
|
||||
sections2 = [
|
||||
(0, 5, 80, 'bcde'),
|
||||
(5, 18, 2, 'ghijklmnopqr'),
|
||||
(18, 20, 50, 't'),
|
||||
(20, 26, 3, 'uvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
def test_iter_sections_4(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {5: 2, 15: 60, 18: 50, 20: 3, 24: 60}
|
||||
sections1 = list(iter_sections(s, sb, max_level=59))
|
||||
sections2 = [
|
||||
(0, 5, 80, 'bcde'),
|
||||
(5, 18, 2, 'ghijklmnopqr'),
|
||||
(18, 20, 50, 't'),
|
||||
(20, 26, 3, 'uvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
|
||||
class AggSectionTest(TestCase):
|
||||
def test_concat_sections_1(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {0: 1, 5: 1, 15: 1, 20: 1}
|
||||
sections1 = list(concat_section_texts(s, sb, min_len=10))
|
||||
sections2 = [
|
||||
([0, 1], 'abcdefghijklmno'),
|
||||
([2, 3], 'pqrstuvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
def test_concat_sections_2(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {0: 1, 2: 1, 10: 1, 20: 1, 26: 1}
|
||||
sections1 = list(concat_section_texts(s, sb, min_len=10))
|
||||
sections2 = [
|
||||
([0, 1], 'abcdefghij'),
|
||||
([2, 3, 4], 'klmnopqrstuvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
def test_concat_sections_3(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {0: 1, 4: 1, 6: 1, 16: 1, 26: 1}
|
||||
sections1 = list(concat_section_texts(s, sb, min_len=10))
|
||||
sections2 = [
|
||||
([0, 1, 2], 'abcdefghijklmnop'),
|
||||
([3, 4], 'qrstuvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
def test_concat_sections_4(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {0: 1, 5: 1, 15: 1, 26: 1}
|
||||
sections1 = list(concat_section_texts(s, sb, min_len=10))
|
||||
sections2 = [
|
||||
([0, 1], 'abcdefghijklmno'),
|
||||
([2, 3], 'pqrstuvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
||||
|
||||
def test_concat_sections_5(self):
|
||||
s = 'abcdefghijklmnopqrstuvwxyz'
|
||||
sb = {0: 1, 5: 1, 12: 1, 22: 1, 23: 1, 24: 1, 26: 1}
|
||||
sections1 = list(concat_section_texts(s, sb, min_len=10))
|
||||
sections2 = [
|
||||
([0, 1], 'abcdefghijkl'),
|
||||
([2, 3, 4, 5, 6], 'mnopqrstuvwxyz'),
|
||||
]
|
||||
self.assertEqual(sections1, sections2)
|
54
tests/simhash.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
"""
|
||||
Test cases for simhash-based similarity.
|
||||
"""
|
||||
|
||||
from unittest import TestCase
|
||||
from simhash import Simhash, SimhashIndex
|
||||
from atextcrawler.utils.similarity import (
|
||||
create_simhash,
|
||||
get_features,
|
||||
get_simhash,
|
||||
postgresql_bigint_offset,
|
||||
search_simhash,
|
||||
)
|
||||
|
||||
|
||||
class SimhashTest(TestCase):
|
||||
"""
|
||||
Test simhash creation and search.
|
||||
"""
|
||||
|
||||
def test_search(self):
|
||||
n1 = int('1111111100000000', 2)
|
||||
n2 = int('1111111100000111', 2)
|
||||
n3 = int('1000000000000000', 2)
|
||||
n4 = int('1000000000000111', 2)
|
||||
n5 = int('1000001111000000', 2)
|
||||
objs = [
|
||||
('1', Simhash(n1)),
|
||||
('3', Simhash(n3)),
|
||||
('4', Simhash(n4)),
|
||||
]
|
||||
index = SimhashIndex(objs, k=3)
|
||||
found = search_simhash(index, Simhash(n5))
|
||||
self.assertEqual(found, [])
|
||||
found = search_simhash(index, Simhash(n1))
|
||||
self.assertEqual(found, [1])
|
||||
found = search_simhash(index, Simhash(n2))
|
||||
self.assertEqual(found, [1])
|
||||
found = search_simhash(index, Simhash(n4))
|
||||
self.assertEqual(found, [3, 4])
|
||||
|
||||
def test_create(self):
|
||||
index = SimhashIndex([], k=3)
|
||||
hash_val_1 = create_simhash(index, 101, get_simhash('hello ' * 20))
|
||||
hash_val_2 = create_simhash(index, 102, get_simhash('another one'))
|
||||
simhash_1 = Simhash(hash_val_1 + postgresql_bigint_offset)
|
||||
simhash_2 = Simhash(hash_val_2 + postgresql_bigint_offset)
|
||||
found = search_simhash(index, simhash_1)
|
||||
self.assertEqual(found, [101])
|
||||
found = search_simhash(index, simhash_2)
|
||||
self.assertEqual(found, [102])
|
||||
simhash_3 = get_simhash('hello ' * 20 + 'X')
|
||||
found = search_simhash(index, simhash_3)
|
||||
self.assertEqual(found, [101])
|
65
tests/text.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
"""
|
||||
Test cases for html page cleaning.
|
||||
"""
|
||||
|
||||
from unittest import TestCase
|
||||
from atextcrawler.utils.html import clean_page
|
||||
|
||||
|
||||
class CleanHtmlTest(TestCase):
|
||||
"""
|
||||
Test clean_page.
|
||||
|
||||
Have an eye on self-closing tags (br, hr, ...).
|
||||
"""
|
||||
|
||||
def test_clean_page_1(self):
|
||||
s = '<em>Hello</em><br><script>malicious="<script>"</script>anything'
|
||||
r = '<em>Hello</em><br/>anything'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
|
||||
def test_clean_page_2(self):
|
||||
s = '<em>Hello</em><br /><script>malicious<script></script>anything'
|
||||
r = '<em>Hello</em><br/>anything'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
|
||||
def test_clean_page_3(self):
|
||||
# nesting
|
||||
s = '--<figure>xx<figure>yy</figure>zz</figure>..'
|
||||
r = '--..'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
|
||||
def test_clean_page_4(self):
|
||||
# aria-hidden
|
||||
s = '--<p aria-hidden=true>xx</p>..'
|
||||
r = '--..'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
s = '--<p aria-hidden="true">xx</p>..'
|
||||
r = '--..'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
s = '--<p aria-hidden=false>xx</p>..'
|
||||
r = '--<p aria-hidden="false">xx</p>..'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
s = '--<p aria-hidden="false">xx</p>..'
|
||||
r = '--<p aria-hidden="false">xx</p>..'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
s = '--<p aria-hidden=??>xx</p>..'
|
||||
r = '--<p aria-hidden="??">xx</p>..'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
|
||||
def test_clean_page_5(self):
|
||||
# no removal
|
||||
s = '--<p>xx<em>yy</em></p>..'
|
||||
r = '--<p>xx<em>yy</em></p>..'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
|
||||
def test_clean_page_6(self):
|
||||
# self-closing tags to be removed
|
||||
s = '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn'
|
||||
r = '--<p>xx</p>\n...<h1>tt</h1>nn'
|
||||
self.assertEqual(str(clean_page(s)), r)
|
||||
|
||||
def test_clean_page_7(self):
|
||||
s = '--<p rel=search>tt<area /></p>nn'
|
||||
r = '--nn'
|
||||
self.assertEqual(str(clean_page(s)), r)
|