Put under version control

commit a6af5b12d2 (parent d26d23348b)
83 changed files with 20130 additions and 0 deletions
.gitignore (vendored, new file, 51 lines)
@@ -0,0 +1,51 @@
# Backup files
*.~

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
NOTES

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
htmlcov

# Translations
*.mo

# mypy cache
.mypy_cache

# Sphinx documentation
doc/build/
doc/source/reference/

# tmp dir
tmp/
.pre-commit-config.yaml (new file, 30 lines)
@@ -0,0 +1,30 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/psf/black
    rev: 21.11b1
    hooks:
      - id: black
  - repo: https://github.com/timothycrosley/isort
    rev: 5.10.1
    hooks:
      - id: isort
        args: ["--profile", "black", "--filter-files", "-l", "79"]
  - repo: https://github.com/myint/autoflake
    rev: v1.4
    hooks:
      - id: autoflake
        args:
          [
            "--in-place",
            "--remove-all-unused-imports",
            "--ignore-init-module-imports",
            "--remove-unused-variables",
          ]
Pipfile (new file, 46 lines)
@@ -0,0 +1,46 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
aiohttp = "*"
async-lru = "*"
asyncpg = "*"
beautifulsoup4 = "*"
elasticsearch = { version = ">=7.0.0,<8.0.0", extras = ['async'] }
elasticsearch-dsl = { version = ">=7.0.0,<8.0.0" }
feedparser = "*"
gcld3 = "*"
# TODO: recheck
pypandoc = "*"
pytidylib = "*"
pytz = "*"
pyyaml = "*"
tika = "*"
tldextract = "*"
voluptuous = "*"
simhash = "*"
async-dns = "*"
types-pyyaml = "*"
sphinx-rtd-theme = "*"

[dev-packages]
mypy = "*"
pre-commit = "*"
sphinx = "*"
myst-parser = "*"
isort = "*"
blacken-docs = "*"
pybetter = "*"
interrogate = "*"
autoflake = "*"
types-pyyaml = "*"
types-pytz = "*"
black = "*"

[requires]
python_version = "3.9"

[pipenv]
allow_prereleases = true
Pipfile.lock (generated, new file, 1561 lines): diff suppressed because it is too large.
README.md (new file, 13 lines)
@@ -0,0 +1,13 @@
atextcrawler is an asynchronous webcrawler indexing text for literal and semantic search.

Its client-side counterpart is [atextsearch](https://gitea.multiname.org/a-text/atextsearch)

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.
doc/Makefile (new file, 20 lines)
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
doc/source/conf.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys

proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')


# -- Project information -----------------------------------------------------

project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'

# The full version, including alpha/beta/rc tags
release = '0.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'myst_parser',
    'sphinx.ext.graphviz',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

autosummary_generate = True

source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}
doc/source/config_template/initial_data/seed_urls.list (new file, 23 lines)
@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.

# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/

# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/
doc/source/config_template/main.yaml (new file, 88 lines)
@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
#  - 'dev': development instance
#  - 'staging': staging instance
#  - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as a relative path, it will be relative to the
# directory of this file (main.yaml).
# Read the documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________

# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item;
  # also the delay in seconds after a worker has found
  # no site to process
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # Host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext

# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict
doc/source/config_template/plugins/__init__.py (new file, empty)
doc/source/config_template/plugins/filter_resource_path.py (new file, 22 lines)
@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.

This plugin implements :func:`rp_filter`.
"""

from typing import Optional


def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter found paths (may depend on site).

    To filter out a path (i.e., not add it to table `site_path`)
    return None.
    """
    path = durl.pwa()
    # skip fetching images (linked from a tags; img tags are skipped anyway)
    if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
        return None
    path = path.removesuffix('?amp=1')
    return path
doc/source/config_template/plugins/filter_site.py (new file, 47 lines)
@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.

This plugin implements :func:`site_filter`.
"""

import re

from atextcrawler.models import Site

MIN_RELEVANCE_SCORE = 5


async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    If the site shall be crawled, return True, else False.
    """
    # limit to sites in English or German language
    if not set(['de', 'en']) & set(site.langs):
        return False
    score = 0.0
    for crit_name, weight, langs, crit_re in re_criteria:
        if '*' in langs or set(langs) & set(site.langs):
            findings = crit_re.findall(site.startpage_text)
            if findings:
                score += weight * len(findings)
            if site.title and crit_re.search(site.title):
                score += 4 * weight
            if site.description and crit_re.search(site.description):
                score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE


re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    # note: the language entry must be a 1-tuple ('de',); a bare ('de') is just the string 'de'
    ('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
}
doc/source/config_template/plugins/filter_site_path.py (new file, 24 lines)
@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.

This plugin implements :func:`sp_filter`.
"""


def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    if 'amusewiki' in site.meta_info.get('generator', '').lower():
        if any(
            [
                path.endswith(end)
                for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
            ]
        ):
            return False
    if '/bbselect?' in path:
        return False
    return True
doc/source/devel/devel.md (new file, 63 lines)
@@ -0,0 +1,63 @@
## Setup dev environment
1. You need Python 3.9 or later.
1. Install pipenv, e.g.: install pip3 with `apt install python3-pip`, then run `pip3 install --user pipenv`.
1. Clone the repo and set up a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```

## Configure the instance
See [installation](installation.md).

## Run
```
python -m atextcrawler
```

## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```

## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```

## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```

## Release
There are no releases (currently).

## Useful commands

### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```

### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;

http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*

http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices

-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
```
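
### Notify a running instance
The application listens on a PostgreSQL channel named `'atextcrawler_' + instance_name`
(see `handle_notifications` in `application.py`); a message `site_update <site_id>`
makes a worker that is currently crawling that site cancel its crawl.
Below is a minimal sketch (not part of the repo) of sending such a notification with
asyncpg; the site id (123) and the credentials are placeholders for your dev instance.
```
import asyncio

import asyncpg


async def main():
    # connection parameters as configured for the dev instance (placeholders)
    conn = await asyncpg.connect(
        host='127.0.0.1',
        port=5432,
        database='atextcrawler-dev',
        user='atextcrawler-dev',
        password='*************',
    )
    # channel name for instance_name 'atextcrawler_dev'
    await conn.execute("NOTIFY atextcrawler_atextcrawler_dev, 'site_update 123'")
    await conn.close()


asyncio.run(main())
```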
doc/source/devel/related_work.md (new file, 64 lines)
@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)

### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
  * [repo](https://github.com/adbar/trafilatura)
  * [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)

#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)

### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)

### url handling
* [courlan](https://pypi.org/project/courlan/)

### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)

### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/)

### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously

### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
* https://support.shareaholic.com/hc/en-us/articles/115003085186

### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language

ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview
doc/source/devel/todo.md (new file, 77 lines)
@@ -0,0 +1,77 @@
## TODO

* parse html time tags

* site annotations:
  * categories
  * historical (no changes any more since n months)
  * news
  * local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip

* allow for tls in elasticsearch config

* replace dashes, dots and quotes (see the sketch after the mapping below): https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
'–': '--',
'—': '---',
'…': '...',
'“': '"',
'”': '"',
'„': '"',
'″': '"',
'‘': "'",
'’': "'",
'′': "'",
```
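A self-contained sketch of such a replacement (an illustration, not code from this repo), using `str.translate` with a mapping like the one above:
```
import sys

# map typographic dashes, ellipses and quotes to ASCII stand-ins
UNSMARTEN = str.maketrans({
    '\u2013': '--',   # en dash
    '\u2014': '---',  # em dash
    '\u2026': '...',  # ellipsis
    '\u201c': '"', '\u201d': '"', '\u201e': '"', '\u2033': '"',
    '\u2018': "'", '\u2019': "'", '\u2032': "'",
})


def unsmarten(text: str) -> str:
    return text.translate(UNSMARTEN)


if __name__ == '__main__':
    print(unsmarten(sys.stdin.read()), end='')
```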
* normalize quotation marks and punctuation in general
  * https://unicode-table.com/en/sets/quotation-marks/
  * https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
  * https://www.fileformat.info/info/unicode/category/Po/list.htm
  * https://www.gaijin.at/en/infos/unicode-character-table-punctuation
  * ⁝

* cancel crawls that take too long

* search for "TODO" in code

* feedparser has support for JSON feeds since commit
  a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
  (as of 2020-10-26 in the "develop" branch, not part of a release);
  the version names are 'json1' and 'json11'

* allow site URLs with path, e.g.
  https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/

* add more languages

## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives

* [spacy-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)

* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)
doc/source/development.rst (new file, 9 lines)
@@ -0,0 +1,9 @@
Development
-----------

.. toctree::
   :maxdepth: 2

   devel/devel
   devel/todo
   devel/related_work
doc/source/elasticsearch.md (new file, 119 lines)
@@ -0,0 +1,119 @@
# Howto elasticsearch

## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```

If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```

## Setup

### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).

We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.

```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```

First test:
```
http -j GET 127.0.0.1:9200/
```

### Storage

```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```

Edit /etc/elasticsearch/elasticsearch.yml
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```

```
systemctl restart elasticsearch
```

The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```

### Setup passwords
Setup passwords:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```

Copy the output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```

Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```

### Memory limitation
To limit memory usage, add a systemd override and restart the service:
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF

systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```

## Usage
Some useful requests:

### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```
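
From Python, the key can then be used with the async client pinned in the Pipfile
(elasticsearch[async] 7.x). This is a minimal sketch, not code from this repo;
replace the id and secret with the values returned by the request above:
```
import asyncio

from elasticsearch import AsyncElasticsearch


async def main():
    es = AsyncElasticsearch(
        hosts=['127.0.0.1'],
        api_key=('API_KEY_ID', 'API_KEY_SECRET'),  # placeholders
    )
    # list the indices this key may access
    print(await es.cat.indices(index='anarchism_*', format='json'))
    await es.close()


asyncio.run(main())
```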
doc/source/index.rst (new file, 37 lines)
@@ -0,0 +1,37 @@
atextcrawler
============

atextcrawler is an asynchronous webcrawler indexing text
for literal and semantic search.

Its client-side counterpart is atextsearch_.

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   introduction
   installation
   maintenance
   development
   reference/modules


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
doc/source/installation.md (new file, 122 lines)
@@ -0,0 +1,122 @@
# Installation
Installation was only tested on Debian bullseye (on amd64).
The instructions below are for this system.
(Please adapt to other environments.)

## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for python package gcld3 (see below).

## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```

## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).

Note: TLS is not yet supported, so install this service locally.

See [elasticsearch howto](elasticsearch.md).

## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.

Note: TLS is not yet supported, so install this service locally.

See [tensorflow howto](tensorflow_model_server.md).

## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages # for systemd
pre-commit install
```

Note: One of the dependencies, Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```

## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```

Edit `$HOME/.config/atextcrawler/main.yaml`.

If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/plugins
```

Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.

Check (and print) the instance configuration:
```
python -m atextcrawler.config
```

## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.

## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target

[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```
doc/source/introduction.md (new file, 66 lines)
@@ -0,0 +1,66 @@
# Introduction

## What atextcrawler does
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
  of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
  optimized for html5); discard non-text content, but handle feeds
  and sitemaps
* Extract internal and external links; external links contribute
  to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch,
  using tensorflow model server with a multilingual language model

## Architecture
Everything runs in a single Python process.
We use asyncio where possible (almost everywhere).

1. There is a queue of websites, see database table `site_queue`.
   The queue is fed a) on first startup with seeds, b) manually
   and c) from crawls which find external links.
   When the queue is handled, new sites are stored to table `site`.
   New sites are always updated; existing sites only if the last update was more than `crawl.site_revisit_delay` seconds in the past.
   After the queue has been handled there is a delay
   (`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
   criteria are applied to its content to determine whether
   the site is relevant. (It is assumed that (non-)relevance is
   obvious from the start page already.) If the site is relevant,
   more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
   `crawl.workers`) which concurrently crawl sites, one at a time
   per worker. (During the crawl the site is marked as locked using
   crawl_active=true.) They pick a relevant site which has not been crawled for a certain time ("checkout"), crawl it, and finally mark it as crawled (crawl_active=false, "checkin") and schedule the next crawl.
   Each crawl (with begin time, end time, number of found (new)
   resources) is stored in table `crawl`.
1. Crawls are either full crawls (all paths reachable
   through links from the start page are fetched) or feed crawls (only paths listed in a feed of the site are fetched). The respective (minimum) intervals in which these crawls happen are `full_crawl_interval` and `feed_crawl_interval`.
   Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
   sitemap) or a TextResource (redirects are followed and irrelevant content is ignored). A TextResource obtained from a path can be very similar to a resource obtained from another path; in this case no new resource is created, but both paths are linked to the same resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
   added to table `site_path`. If it is a feed, the feed is stored in table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.

## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are considered:
blacklisting or whitelisting has precedence over function `site_filter`
(in plugin `filter_site`).

Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).

Each annotation requires the base_url of the annotated site and,
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).

## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
  background task on a server with limited resources
  (or even an SBC, like a raspberry pi, with attached storage)
* atextcrawler only indexes text, no other resources like images
doc/source/maintenance.md (new file, 23 lines)
@@ -0,0 +1,23 @@
# Maintenance

## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```

## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```
doc/source/tensorflow_model_server.md (new file, 98 lines)
@@ -0,0 +1,98 @@
# Tensorflow model server

## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```

## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```

Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```

Config file `/srv/tensorflow/config`:
```
model_config_list: {
  config: {
    name: "sentences",
    base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
    model_platform: "tensorflow"
    model_version_policy: {latest{}},
  },
  config: {
    ... (next model)
  },
}
```

## Systemd integration
Edit /etc/systemd/system/tensorflow.service
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service

[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```

Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```

## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```

## Docs

* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server

Datasets:
* https://www.tensorflow.org/datasets/catalog/overview
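
## Query the prediction endpoint from Python
To complement the Usage section above: a minimal sketch (not code from this repo) of
getting sentence embeddings via the REST predict API, assuming the server configured
here is listening on port 9000 with model name `sentences`. The request body follows
the TensorFlow Serving REST API ("instances" in, "predictions" out).
```
import asyncio

import aiohttp

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'


async def embed(texts):
    # POST {"instances": [...]} and return the list of embedding vectors
    async with aiohttp.ClientSession() as session:
        async with session.post(ENDPOINT, json={'instances': texts}) as resp:
            data = await resp.json()
    return data['predictions']


embeddings = asyncio.run(embed(['Hello world', 'Hallo Welt']))
print(len(embeddings), len(embeddings[0]))
```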
license.txt (new file, 48 lines)
@@ -0,0 +1,48 @@
ANTI-AUTHORITARIAN LICENSE version 1.0
________________________________________________________________________________

Obviously, this license is relevant to all who are bound by law.

The licensee ("you") must not be a commercial, military, clerical or
governmental entity. For this license the term "software" means the program
code, documentation as well as other data (for instance, language files).

Subject to the respective terms and conditions described below the licensee
is granted the non-exclusive and non-transferable license to:
A. make copies of the software
B. create derivative works ("modifications")
C. install and run copies or modifications of the software on any number of
   servers, thereby making them usable for the licensee and possibly others
D. offer or give copies or modifications of the software, or parts of the
   unmodified or modified software to others

For these permissions the respective conditions stated below must be met:
* For permission A condition 1 must be met.
* For permission B all of the conditions 1, 3, 4 must be met.
* For permission C all of the conditions 2, 3 must be met.
* For permission D all of the conditions 1, 2, 3, 4, 5 must be met.

These are the conditions:
1. You include this copyright notice and license in any copy or modification.
   In files that contain a reference to it you preserve this reference.
2. You do not use this software or any modification of it for any commercial
   purpose or for monetary gain, and also not for any military, governmental
   or religious purpose; here with commercial purpose we mean activities which
   have among their goals to make profit, be it monetary profit or any other
   kind of profit that may entail or contribute to monetary profit.
3. Demos or screenshots of the modified or unmodified software must not be
   published in any medium which requires the viewers to pay money in order
   to see the contents; here money paid for mere internet connectivity (i.e.,
   independent of the content supplier) is to be disregarded.
4. You do not impose any further restrictions on this software or any
   derivative works beyond those restrictions herein.
5. The copy or modification must include source code, and must allow
   distribution in source code as well as compiled form. The source code
   must be the preferred form in which a programmer would modify the program.
   Deliberately obfuscated source code is not allowed. Intermediate forms
   such as the output of a preprocessor or translator are not allowed.

For this license itself, if re-used for other software, the following
copyright and license applies (copyheart license):

♡ Copying is an act of love. Please copy.
pyproject.toml (new file, 10 lines)
@@ -0,0 +1,10 @@
# TOML formatted file; see PEP 518

[tool.isort]
profile = "black"
#multi_line_output = 3

[tool.black]
line-length = 79
target_version = ['py39']
skip-string-normalization = true
src/atextcrawler/__init__.py (new file, empty)
src/atextcrawler/__main__.py (new file, 12 lines)
@@ -0,0 +1,12 @@
"""
atextcrawler application execution entry point.
"""

import asyncio

from .application import Application
from .config import Config

if __name__ == '__main__':
    config = Config().get()
    asyncio.run(Application(config).run())
204
src/atextcrawler/application.py
Normal file
204
src/atextcrawler/application.py
Normal file
|
@ -0,0 +1,204 @@
|
||||||
|
"""
|
||||||
|
atextcrawler application.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import importlib
|
||||||
|
import logging
|
||||||
|
import signal
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from systemd.journal import JournalHandler
|
||||||
|
|
||||||
|
from .config import Config
|
||||||
|
from .crawl import CrawlWorker
|
||||||
|
from .db import PGPool
|
||||||
|
from .search import shutdown_engine, startup_engine
|
||||||
|
from .site import load_seeds, process_site_queue
|
||||||
|
|
||||||
|
plugin_names = ['filter_site', 'filter_site_path', 'filter_resource_path']
|
||||||
|
|
||||||
|
|
||||||
|
class Application:
|
||||||
|
"""
|
||||||
|
atextcrawler application.
|
||||||
|
|
||||||
|
The basic structure of the application is this:
|
||||||
|
* one site crawler works just on the site_queue: fetching start pages
|
||||||
|
of sites and storing updated site information in table sites
|
||||||
|
* N other CrawlWorkers each do this in a loop:
|
||||||
|
checkout a site that is due for crawl and crawl its resources;
|
||||||
|
they fill the site_queue
|
||||||
|
"""
|
||||||
|
|
||||||
|
running = True
|
||||||
|
|
||||||
|
def __init__(self, config=None):
|
||||||
|
if config is None:
|
||||||
|
config = Config().get()
|
||||||
|
self.config = config
|
||||||
|
self.instance_name = config['instance_name']
|
||||||
|
self.instance_type = config['instance_type']
|
||||||
|
log_level = getattr(
|
||||||
|
logging, config['log_level'].upper(), logging.CRITICAL
|
||||||
|
)
|
||||||
|
self.logger = logging.getLogger('atextcrawler')
|
||||||
|
self.logger.setLevel(log_level)
|
||||||
|
if self.instance_type == 'dev':
|
||||||
|
self.logger.addHandler(logging.StreamHandler())
|
||||||
|
else:
|
||||||
|
self.logger.addHandler(
|
||||||
|
JournalHandler(SYSLOG_IDENTIFIER=self.instance_name)
|
||||||
|
)
|
||||||
|
self.logger.propagate = False
|
||||||
|
self.channel = 'atextcrawler_' + self.config['instance_name']
|
||||||
|
msg = f'Instance "{self}" initializing'
|
||||||
|
self.logger.info(msg)
|
||||||
|
self.plugins = self._load_plugins()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self.instance_name
|
||||||
|
|
||||||
|
def _load_plugins(self):
|
||||||
|
"""
|
||||||
|
Return a dict mapping plugin names to modules.
|
||||||
|
"""
|
||||||
|
modules = {}
|
||||||
|
old_path = sys.path
|
||||||
|
        for name in plugin_names:
            try:
                plugins_dir = self.config['plugins_dir']
                sys.path.insert(0, plugins_dir)
                module = importlib.import_module(name)
                msg = f'Loading plugin "{name}" from {plugins_dir}'
            except:
                module = importlib.import_module(
                    'atextcrawler.plugin_defaults.' + name
                )
                msg = f'Loading plugin "{name}" from default location'
            self.logger.info(msg)
            modules[name] = module
        sys.path = old_path
        return modules

    async def run(self):
        """
        Application lifecycle.
        """
        await asyncio.gather(self.wait_for_shutdown(), self.startup())
        await self.shutdown()

    async def startup(self):
        """
        Asynchronous startup.
        """
        msg = f'Instance "{self}" starting components'
        self.logger.info(msg)
        self.search_engine = await startup_engine(self.config)
        self.pgpool = await PGPool(self.config['postgresql'])
        self.pool = self.pgpool.pool
        await load_seeds(self.config, self.pool)
        await reset_site_locks(self.pool)
        worker_count = self.config['crawl']['workers']
        self.workers = []
        for worker_number in range(worker_count):
            worker = await CrawlWorker(self, worker_number, self.pool)
            self.workers.append(worker)
        worker_coros = [worker.run() for worker in self.workers]
        await asyncio.gather(
            process_site_queue(self, self.pool),
            self.handle_notifications(),
            *worker_coros,
        )

    async def wait_for_shutdown(self):
        """
        Create a shutdown event (:class:`asyncio.Event`) and wait for it.

        The event will be set by a signal handler for SIGINT
        and SIGTERM signals (see :meth:`Application.handle_shutdown_signal`).
        """
        self.shutdown_event = asyncio.Event()
        for sig in (signal.SIGINT, signal.SIGTERM):
            asyncio.get_running_loop().add_signal_handler(
                sig, self.handle_shutdown_signal
            )
        self.logger.debug(f'{self} waiting for shutdown event')
        await self.shutdown_event.wait()
        self.logger.info(f'Instance "{self}" shutdown event')

    def handle_shutdown_signal(self):
        """
        Handle shutdown signal.
        """
        if self.shutdown_event.is_set():
            return
        self.shutdown_event.set()
        self.running = False

    async def shutdown(self):
        """
        Asynchronous shutdown.
        """
        self.logger.debug(f'Instance "{self}" shutting down')
        await self.notify_conn.remove_listener(
            self.channel, self.listen_callback
        )
        await self.pool.release(self.notify_conn)
        for worker in self.workers:
            await worker.shutdown()
        await shutdown_engine(self.search_engine)
        await self.pgpool.shutdown()
        self.logger.info(f'Instance "{self}" shutdown completed')

    async def handle_notifications(self):
        """
        Handle notifications using PostgreSQL's NOTIFY/LISTEN.
        """
        self.notify_conn = await self.pool.acquire()
        await self.notify_conn.add_listener(self.channel, self.listen_callback)

    def listen_callback(self, *args):
        """
        Handle notify event from PostgreSQL.
        """
        channel = args[2]
        if channel != self.channel:
            return
        message = args[3]
        if message.startswith('site_update '):
            try:
                site_id = int(message.removeprefix('site_update '))
                for worker in self.workers:
                    if worker.site and site_id == worker.site.id_:
                        msg = (
                            f'Cancelling worker {worker.worker_number}'
                            f' (site={site_id}) due to site_update'
                        )
                        self.logger.info(msg)
                        worker.running = False
            except:
                pass

    async def sleep(self, duration, t_slice=3):
        """
        Sleep for *duration* seconds while self.running.

        Check self.running every *t_slice* seconds.
        """
        remaining = duration
        while remaining > 0 and self.running:
            await asyncio.sleep(min(t_slice, remaining))
            remaining -= t_slice


async def reset_site_locks(pool):
    """
    Remove locks leftover from last run: Set crawl_active=false for all sites.

    This is relevant when the application was not shut down properly (e.g.
    when the process was killed).
    """
    async with pool.acquire() as conn:
        sql = "UPDATE site SET crawl_active = false WHERE crawl_active = true"
        await conn.execute(sql)
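
Note on the notification handling above: `listen_callback` receives the arguments that asyncpg passes to listeners (connection, pid, channel, payload), and a payload of the form `site_update <id>` cancels the worker currently crawling that site. The following is a minimal sketch (not part of the commit) of how such a notification could be emitted from any asyncpg connection; the function name and the `channel` argument are illustrative assumptions.

# Sketch: emit a site_update notification so that the worker crawling the
# given site cancels its current crawl. Assumes an open asyncpg connection
# and that `channel` equals the Application instance's notification channel.
import asyncpg


async def notify_site_update(
    conn: asyncpg.Connection, channel: str, site_id: int
) -> None:
    # NOTIFY payloads are plain strings; listen_callback parses the id back out.
    await conn.execute(
        "SELECT pg_notify($1, $2)", channel, f'site_update {site_id}'
    )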
7
src/atextcrawler/assets/iana_langs
Normal file
@@ -0,0 +1,7 @@
The recommended language tags to use in webpages are from
the IANA Language Subtag Registry (BCP47), see:
https://www.w3.org/International/questions/qa-html-language-declarations
https://r12a.github.io/app-subtags/


wget -O- https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | rg '^Subtag: |^Tag: ' atextcrawler/assets/iana_langs_ | sed -e 's/^Subtag: //' | sed -e 's/^Tag: //'
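
The asset file above is regenerated from the IANA language subtag registry with the shell pipeline shown in it. A rough Python equivalent (a sketch only, not code from the commit; the function name is an assumption) looks like this:

# Sketch: download the IANA registry and keep only the Subtag:/Tag: values,
# mirroring the wget | rg | sed pipeline documented in the asset file.
import urllib.request

REGISTRY_URL = (
    'https://www.iana.org/assignments/language-subtag-registry/'
    'language-subtag-registry'
)


def fetch_iana_tags() -> list[str]:
    with urllib.request.urlopen(REGISTRY_URL) as resp:
        text = resp.read().decode('utf-8')
    tags = []
    for line in text.splitlines():
        if line.startswith('Subtag: ') or line.startswith('Tag: '):
            tags.append(line.split(': ', 1)[1])
    return tags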
219
src/atextcrawler/assets/iso_639-1
Normal file
@@ -0,0 +1,219 @@
aa
ab
ae
af
ak
am
an
ar
as
av
ay
az
ba
be
bg
bh
bi
bm
bn
bo
br
bs
ca
ca
ce
ch
co
cr
cs
cu
cu
cu
cu
cu
cv
cy
da
de
dv
dv
dv
dz
ee
el
en
eo
es
es
et
eu
fa
ff
fi
fj
fo
fr
fy
ga
gd
gd
gl
gn
gu
gv
ha
he
hi
ho
hr
ht
ht
hu
hy
hz
ia
id
ie
ie
ig
ii
ii
ik
io
is
it
iu
ja
jv
ka
kg
ki
ki
kj
kj
kk
kl
kl
km
kn
ko
kr
ks
ku
kv
kw
ky
ky
la
lb
lb
lg
li
li
li
ln
lo
lt
lu
lv
mg
mh
mi
mk
ml
mn
mr
ms
mt
my
na
nb
nb
nd
nd
ne
ng
nl
nl
nn
nn
no
nr
nr
nv
nv
ny
ny
ny
oc
oj
om
or
os
os
pa
pa
pi
pl
ps
ps
pt
qu
rm
rn
ro
ro
ro
ru
rw
sa
sc
sd
se
sg
si
si
sk
sl
sm
sn
so
sq
sr
ss
st
su
sv
sw
ta
te
tg
th
ti
tk
tl
tn
to
tr
ts
tt
tw
ty
ug
ug
uk
ur
uz
ve
vi
vo
wa
wo
xh
yi
yo
za
za
zh
zu
10000
src/atextcrawler/assets/top_1e4
Normal file
File diff suppressed because it is too large
337
src/atextcrawler/config.py
Normal file
@@ -0,0 +1,337 @@
"""
|
||||||
|
Configuration loader and validator.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from io import TextIOBase
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
from voluptuous import All
|
||||||
|
from voluptuous import Any as VAny
|
||||||
|
from voluptuous import Invalid, Length, Range, Required, Schema, Url
|
||||||
|
from yaml import load
|
||||||
|
|
||||||
|
try:
|
||||||
|
from yaml import CLoader as Loader # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
from yaml import Loader # type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigError(Exception):
|
||||||
|
"""
|
||||||
|
Application configuration error.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, err):
|
||||||
|
self.msg = str(err)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f'Application configuration error: {self.msg}'
|
||||||
|
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
"""
|
||||||
|
Application configuration.
|
||||||
|
|
||||||
|
Access the full application configuration using :meth:`get`.
|
||||||
|
|
||||||
|
It is a dictionary with these keys:
|
||||||
|
|
||||||
|
* 'directory': the configuration directory being used
|
||||||
|
* 'main': the main configuration from main.yaml, but
|
||||||
|
postgresql configuration may be overriden by environment
|
||||||
|
variable ATEXTCRAWLER_POSTGRESQL
|
||||||
|
"""
|
||||||
|
|
||||||
|
config = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get(
|
||||||
|
cls,
|
||||||
|
out: Optional[TextIOBase] = None,
|
||||||
|
) -> Optional[dict]:
|
||||||
|
"""
|
||||||
|
Load and validate app configuration if not already done; return it.
|
||||||
|
|
||||||
|
On errors print them to *out* and if out is sys.stdout, then
|
||||||
|
also exit with exit code 2. Otherwise just return None.
|
||||||
|
"""
|
||||||
|
if cls.config:
|
||||||
|
return cls.config
|
||||||
|
if out is None:
|
||||||
|
out = sys.stdout # type: ignore
|
||||||
|
_config = _load_config()
|
||||||
|
msg = None
|
||||||
|
if isinstance(_config, ConfigError):
|
||||||
|
msg = f'ERROR: configuration could not be loaded: {_config}'
|
||||||
|
else:
|
||||||
|
config = _validate_config(_config)
|
||||||
|
if isinstance(config, ConfigError):
|
||||||
|
config_dir = _config.get('config_dir')
|
||||||
|
msg = (
|
||||||
|
f'ERROR: invalid configuration in {config_dir}:'
|
||||||
|
f' {config}'
|
||||||
|
)
|
||||||
|
if isinstance(_config, ConfigError) or isinstance(config, ConfigError):
|
||||||
|
print(msg, file=out)
|
||||||
|
if out == sys.stdout:
|
||||||
|
sys.exit(2)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
config['postgresql']['min_size'] = config['crawl']['workers'] + 2
|
||||||
|
config['postgresql']['max_size'] = config['crawl']['workers'] + 2
|
||||||
|
cls.config = config
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config() -> Union[ConfigError, dict]:
|
||||||
|
"""
|
||||||
|
Load configuration; search in multiple directories.
|
||||||
|
|
||||||
|
We search these locations; the first location containing main.yaml
|
||||||
|
will be used::
|
||||||
|
|
||||||
|
* a directory defined in environment variable ATEXTCRAWLER_CONF
|
||||||
|
* subdir .config/atextcrawler in the user's home (`$HOME`)
|
||||||
|
* /etc/atextcrawler
|
||||||
|
|
||||||
|
In the same directory where this main.conf is located a subdirectory
|
||||||
|
'plugins' must exist and contain the configurations of plugins.
|
||||||
|
|
||||||
|
On failure return the first error and None.
|
||||||
|
Otherwise return None and a dict with these keys:
|
||||||
|
|
||||||
|
* `directory`: the used configuration directory
|
||||||
|
* `main`: the main application configuration
|
||||||
|
* `plugins`: a dict mapping plugins names to plugin configurations
|
||||||
|
"""
|
||||||
|
Path(__file__).parent.parent
|
||||||
|
config_dirs = []
|
||||||
|
if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
|
||||||
|
config_dirs.append(Path(env_conf))
|
||||||
|
if env_home := os.environ.get('HOME'):
|
||||||
|
config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
|
||||||
|
config_dirs.append(Path('/etc/atextcrawler'))
|
||||||
|
for config_dir in config_dirs:
|
||||||
|
main_yaml_path = config_dir / 'main.yaml'
|
||||||
|
if main_yaml_path.exists():
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
locs = ', '.join([str(loc) for loc in config_dirs if loc])
|
||||||
|
msg = (
|
||||||
|
f'Missing main.yaml in all config locations: {locs}\n'
|
||||||
|
f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
|
||||||
|
f' to define a custom config directory.'
|
||||||
|
)
|
||||||
|
return ConfigError(msg)
|
||||||
|
|
||||||
|
# load main.yaml
|
||||||
|
try:
|
||||||
|
with main_yaml_path.open() as main_yaml:
|
||||||
|
main_config = load(main_yaml.read(), Loader=Loader)
|
||||||
|
except Exception as err:
|
||||||
|
return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')
|
||||||
|
|
||||||
|
# main_config must be a dict
|
||||||
|
if not isinstance(main_config, dict):
|
||||||
|
return ConfigError(f'File {main_yaml_path} must contain a dictionary')
|
||||||
|
|
||||||
|
# postgresql config from environment has precedence
|
||||||
|
postgresql_config = _get_env_postgresql()
|
||||||
|
if isinstance(postgresql_config, ConfigError):
|
||||||
|
return postgresql_config
|
||||||
|
main_config['postgresql'] = postgresql_config or main_config['postgresql']
|
||||||
|
|
||||||
|
main_config['config_dir'] = str(config_dir)
|
||||||
|
return main_config
|
||||||
|
|
||||||
|
|
||||||
|
def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
|
||||||
|
"""
|
||||||
|
Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.
|
||||||
|
|
||||||
|
Return an error or the PostgreSQL config (which can be None if
|
||||||
|
the environment variable is not defined.
|
||||||
|
"""
|
||||||
|
env_var = 'ATEXTCRAWLER_POSTGRESQL'
|
||||||
|
value = os.environ.get(env_var, '').strip()
|
||||||
|
if not value:
|
||||||
|
return None
|
||||||
|
param_names = (
|
||||||
|
'host',
|
||||||
|
'port',
|
||||||
|
'database',
|
||||||
|
'user',
|
||||||
|
'password',
|
||||||
|
'schema_name',
|
||||||
|
)
|
||||||
|
re_dsn = re.compile(
|
||||||
|
'((' + '|'.join(param_names) + ')'
|
||||||
|
'=("(((?=[^"\\\\]).|\\\\.)*)"' # value in double quotes
|
||||||
|
'|\'(((?=[^\'\\\\]).|\\\\.)*)\'' # value in single quotes
|
||||||
|
'|([^"\' ]*)' # value unquoted
|
||||||
|
')( |$))+?'
|
||||||
|
)
|
||||||
|
params = {}
|
||||||
|
for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
|
||||||
|
params[varname] = (
|
||||||
|
v3
|
||||||
|
or (v1 or '').replace('\\"', '"')
|
||||||
|
or (v2 or '').replace("\\'", "'")
|
||||||
|
)
|
||||||
|
if 'host' not in params:
|
||||||
|
params['host'] = 'localhost'
|
||||||
|
if 'port' not in params:
|
||||||
|
params['port'] = '5432'
|
||||||
|
if 'schema_name' not in params:
|
||||||
|
params['schema_name'] = 'public'
|
||||||
|
for name in param_names:
|
||||||
|
if name not in params:
|
||||||
|
return ConfigError(
|
||||||
|
f'Missing {name} in environment variable {env_var}'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
params['port'] = int(params['port'])
|
||||||
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_config(config: Any) -> Union[ConfigError, dict]:
|
||||||
|
"""
|
||||||
|
Validate the given configuration and fill in default values.
|
||||||
|
|
||||||
|
If invalid, return only the first error.
|
||||||
|
Otherwise return the configuration with added default values.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return schema_main(config)
|
||||||
|
except Exception as err:
|
||||||
|
return ConfigError(err)
|
||||||
|
|
||||||
|
|
||||||
|
def plugins_dir(config):
|
||||||
|
"""
|
||||||
|
Validate plugins directory (absolute or relative path).
|
||||||
|
|
||||||
|
If it is a relative path, prepend the config_dir.
|
||||||
|
"""
|
||||||
|
config_dir = config['config_dir']
|
||||||
|
plugins_dir = config['plugins_dir']
|
||||||
|
if plugins_dir.startswith('/'):
|
||||||
|
try:
|
||||||
|
plugins_dir = Path(plugins_dir)
|
||||||
|
except:
|
||||||
|
raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
plugins_dir = str(Path(config_dir) / Path(plugins_dir))
|
||||||
|
config['plugins_dir'] = plugins_dir
|
||||||
|
except:
|
||||||
|
raise Invalid(f'Invalid plugins_dir "{plugins_dir}" not found')
|
||||||
|
if not (Path(plugins_dir) / '__init__.py').exists():
|
||||||
|
raise Invalid(f'plugins_dir "{plugins_dir}" has no "__init__.py"')
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def postgresql_identifier(value):
|
||||||
|
"""
|
||||||
|
Validate a PostgreSQL identifier.
|
||||||
|
"""
|
||||||
|
if not isinstance(value, str) or not re.match(
|
||||||
|
'^[a-z][a-z0-9_]{0,30}$', value
|
||||||
|
):
|
||||||
|
raise Invalid(
|
||||||
|
f'Invalid PostgreSQL identifier "{value}", '
|
||||||
|
f'pattern must be: [a-z][a-z0-9_]{0,30}'
|
||||||
|
)
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def positive_number(value):
|
||||||
|
"""
|
||||||
|
Validate a positive number (int or float).
|
||||||
|
"""
|
||||||
|
if (isinstance(value, int) or isinstance(value, float)) and value > 0:
|
||||||
|
return value
|
||||||
|
raise Invalid('Not a positive number')
|
||||||
|
|
||||||
|
|
||||||
|
schema_postgresql = Schema(
|
||||||
|
{
|
||||||
|
Required('host'): All(str, Length(min=1)),
|
||||||
|
Required('port', default=5432): All(int, Range(min=0, max=65535)),
|
||||||
|
Required('database'): All(str, Length(min=1)),
|
||||||
|
Required('user'): All(str, Length(min=1)),
|
||||||
|
Required('password'): str,
|
||||||
|
Required('schema_name', default='public'): postgresql_identifier,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_crawl = Schema(
|
||||||
|
{
|
||||||
|
Required('workers', default=10): All(int, Range(min=0, max=1000)),
|
||||||
|
Required('site_delay', default=600): positive_number,
|
||||||
|
Required('site_revisit_interval', default=3600): positive_number,
|
||||||
|
Required('resource_delay', default=5): positive_number,
|
||||||
|
Required('full_crawl_interval', default=864000): positive_number,
|
||||||
|
Required('feed_crawl_interval', default=86400): positive_number,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_elasticsearch = Schema(
|
||||||
|
{
|
||||||
|
Required('host'): All(str, Length(min=1)),
|
||||||
|
Required('api_key'): All(str, Length(min=1)),
|
||||||
|
Required('id'): All(str, Length(min=1)),
|
||||||
|
Required('index_base_name'): All(str, Length(min=1)),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_tensorflow = Schema(
|
||||||
|
{
|
||||||
|
Required('model_server_endpoint'): Url(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
schema_main = Schema(
|
||||||
|
All(
|
||||||
|
{
|
||||||
|
Required('config_dir'): All(str, Length(min=1)),
|
||||||
|
Required(
|
||||||
|
'instance_name', default='atextcrawler'
|
||||||
|
): postgresql_identifier,
|
||||||
|
Required('instance_type', default='prod'): VAny(
|
||||||
|
'dev',
|
||||||
|
'staging',
|
||||||
|
'prod',
|
||||||
|
),
|
||||||
|
Required('log_level', default='info'): VAny(
|
||||||
|
'critical',
|
||||||
|
'error',
|
||||||
|
'warning',
|
||||||
|
'info',
|
||||||
|
'debug',
|
||||||
|
),
|
||||||
|
Required('plugins_dir', default='plugins'): All(
|
||||||
|
str, Length(min=1)
|
||||||
|
),
|
||||||
|
Required('postgresql'): schema_postgresql,
|
||||||
|
Required('crawl'): schema_crawl,
|
||||||
|
Required('elasticsearch'): schema_elasticsearch,
|
||||||
|
Required('tensorflow'): schema_tensorflow,
|
||||||
|
},
|
||||||
|
plugins_dir,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
pprint(Config().get())
|
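
As a usage sketch for the module above (assumptions: a valid main.yaml exists in one of the searched config directories; the DSN value and credentials shown are only illustrations), the PostgreSQL settings can be overridden via ATEXTCRAWLER_POSTGRESQL and the validated configuration loaded with Config:

# Usage sketch, not part of the commit.
import os

from atextcrawler.config import Config

os.environ['ATEXTCRAWLER_POSTGRESQL'] = (
    "host=localhost port=5432 database=atextcrawler"
    " user=atextcrawler password='secret' schema_name=public"
)

config = Config().get()  # loads, validates and caches the configuration
if config:
    print(config['config_dir'])          # directory containing main.yaml
    print(config['postgresql']['host'])  # taken from the environment variable
    print(config['crawl']['workers'])    # defaults to 10 if absent in main.yaml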
215
src/atextcrawler/crawl.py
Normal file
@@ -0,0 +1,215 @@
"""
|
||||||
|
Crawl a site.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
from .models import Crawl
|
||||||
|
from .resource import ResourceFetcher, get_site_path, process_site_path
|
||||||
|
from .site import (
|
||||||
|
RobotsInfo,
|
||||||
|
checkin_site,
|
||||||
|
checkout_site,
|
||||||
|
fetch_feeds,
|
||||||
|
process_site,
|
||||||
|
update_site,
|
||||||
|
)
|
||||||
|
from .tensorflow import TensorFlow
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlWorker:
|
||||||
|
"""
|
||||||
|
Worker fetching sites, crawling their resources and storing statistics.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, app, worker_number, pool):
|
||||||
|
self.app = app
|
||||||
|
self.worker_number = worker_number
|
||||||
|
self.pool = pool
|
||||||
|
self.site_delay = self.app.config['crawl']['site_delay']
|
||||||
|
self.resource_delay = self.app.config['crawl']['resource_delay']
|
||||||
|
self.site = None
|
||||||
|
self.crawl = None
|
||||||
|
self.running = True # do crawl
|
||||||
|
|
||||||
|
def __await__(self):
|
||||||
|
return self.__ainit__().__await__()
|
||||||
|
|
||||||
|
async def __ainit__(self):
|
||||||
|
await self.startup()
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def startup(self):
|
||||||
|
"""
|
||||||
|
Asynchronous startup.
|
||||||
|
"""
|
||||||
|
logger.info(f'Starting worker {self.worker_number}')
|
||||||
|
self.conn = await self.pool.acquire()
|
||||||
|
self.session = aiohttp.ClientSession()
|
||||||
|
self.fetcher = ResourceFetcher(self.session)
|
||||||
|
self.tf = TensorFlow(self.app, self.session)
|
||||||
|
|
||||||
|
async def shutdown(self):
|
||||||
|
"""
|
||||||
|
Asynchronous shutdown.
|
||||||
|
"""
|
||||||
|
logger.info(f'Shutting down worker {self.worker_number}')
|
||||||
|
await self.session.close()
|
||||||
|
await self.pool.release(self.conn)
|
||||||
|
|
||||||
|
async def run(self):
|
||||||
|
"""
|
||||||
|
Worker loop: fetch a site, crawl its resources and store statistics.
|
||||||
|
|
||||||
|
If no site needs to be crawled, sleep for self.site_delay seconds
|
||||||
|
(configured in crawl.site_delay).
|
||||||
|
"""
|
||||||
|
await self.app.sleep(2)
|
||||||
|
while self.app.running and self.running:
|
||||||
|
self.site, is_full, more = await checkout_site(self.app, self.conn)
|
||||||
|
if not self.site:
|
||||||
|
msg = f'Worker {self.worker_number}: sites exhausted'
|
||||||
|
logger.debug(msg)
|
||||||
|
if not more:
|
||||||
|
await self.app.sleep(self.site_delay)
|
||||||
|
continue
|
||||||
|
self.crawl = await get_or_create_crawl(
|
||||||
|
self.conn, self.site.id_, is_full
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
if is_full:
|
||||||
|
site_upd, _ = await update_site(
|
||||||
|
self.app,
|
||||||
|
self.fetcher,
|
||||||
|
self.conn,
|
||||||
|
self.site.base_url,
|
||||||
|
site=self.site,
|
||||||
|
)
|
||||||
|
if site_upd and site_upd.crawl_enabled:
|
||||||
|
self.site = site_upd
|
||||||
|
await process_site(
|
||||||
|
self.fetcher,
|
||||||
|
self.conn,
|
||||||
|
self.site,
|
||||||
|
)
|
||||||
|
elif self.site.crawl_enabled:
|
||||||
|
await fetch_feeds(self.fetcher, self.conn, self.site)
|
||||||
|
if self.site.crawl_enabled:
|
||||||
|
await self.crawl_resources()
|
||||||
|
except:
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} failed crawl'
|
||||||
|
f' {self.crawl.id_} of site {self.site.id_}'
|
||||||
|
f' ({self.site.base_url})'
|
||||||
|
)
|
||||||
|
logger.exception(msg)
|
||||||
|
await self.crawl.finish(
|
||||||
|
self.conn, self.app.running and self.running
|
||||||
|
)
|
||||||
|
await checkin_site(self.app, self.conn, self.site, self.crawl)
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} finished crawl'
|
||||||
|
f' {self.crawl.id_}'
|
||||||
|
)
|
||||||
|
logger.debug(msg)
|
||||||
|
self.site = None
|
||||||
|
# if we were cancelled, but the app is still running, run again
|
||||||
|
if self.app.running:
|
||||||
|
self.running = True
|
||||||
|
msg = f'Closing crawler {self.worker_number}'
|
||||||
|
logger.debug(msg)
|
||||||
|
|
||||||
|
async def crawl_resources(self):
|
||||||
|
"""
|
||||||
|
Loop over resources of the site and process them. Collect statistics.
|
||||||
|
|
||||||
|
All workers operate on distinct sites, so no need for locking here.
|
||||||
|
"""
|
||||||
|
crawl_type = 'full' if self.crawl.is_full else 'feed'
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} beginning'
|
||||||
|
f' {crawl_type} crawl {self.crawl.id_}'
|
||||||
|
f' of site {self.site.id_} ({self.site.base_url})'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
resource_delay = self.resource_delay
|
||||||
|
robots = await RobotsInfo(self.site.base_url)
|
||||||
|
if robots.delay:
|
||||||
|
resource_delay = robots.delay
|
||||||
|
while self.app.running and self.running:
|
||||||
|
site_path = await get_site_path(
|
||||||
|
self.conn,
|
||||||
|
self.site,
|
||||||
|
self.crawl.t_begin,
|
||||||
|
only_new=not self.crawl.is_full,
|
||||||
|
)
|
||||||
|
if not site_path:
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} ending crawl'
|
||||||
|
f' {self.crawl.id_}: paths exhausted'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
sp_filter = self.app.plugins['filter_site_path'].sp_filter
|
||||||
|
if sp_filter(self.site, site_path.path, robots):
|
||||||
|
is_new_resource = await process_site_path(
|
||||||
|
self.app,
|
||||||
|
self.worker_number,
|
||||||
|
self.conn,
|
||||||
|
self.fetcher,
|
||||||
|
self.tf,
|
||||||
|
self.site,
|
||||||
|
site_path,
|
||||||
|
)
|
||||||
|
if is_new_resource:
|
||||||
|
self.crawl.n_resources_new += 1
|
||||||
|
if is_new_resource is not None:
|
||||||
|
self.crawl.n_resources += 1
|
||||||
|
await self.app.sleep(resource_delay)
|
||||||
|
else:
|
||||||
|
sql = (
|
||||||
|
"UPDATE site_path SET"
|
||||||
|
" last_visit=now() at time zone 'UTC',"
|
||||||
|
" filtered=true"
|
||||||
|
" WHERE id=$1"
|
||||||
|
)
|
||||||
|
await self.conn.execute(sql, site_path.id_)
|
||||||
|
except:
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number} processing path failed'
|
||||||
|
f' in crawl {self.crawl.id_}: {site_path}'
|
||||||
|
)
|
||||||
|
logger.exception(msg)
|
||||||
|
site_path.ok_count -= 1
|
||||||
|
await site_path.save(self.conn)
|
||||||
|
msg = (
|
||||||
|
f'Worker {self.worker_number}: stopped crawl' f' {self.crawl.id_}'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_or_create_crawl(conn, site_id, is_full=True) -> Crawl:
|
||||||
|
"""
|
||||||
|
Return a new or existing+unfinished crawl.
|
||||||
|
|
||||||
|
If an existing crawl is found, return it, disregarding whether
|
||||||
|
it is a full crawl or not.
|
||||||
|
"""
|
||||||
|
sql = "SELECT * FROM crawl WHERE site_id=$1 AND t_end is null LIMIT 1"
|
||||||
|
if row := await conn.fetchrow(sql, site_id):
|
||||||
|
return await Crawl().load_from_row(row)
|
||||||
|
else:
|
||||||
|
# create a new crawl
|
||||||
|
crawl = Crawl(
|
||||||
|
site_id=site_id,
|
||||||
|
is_full=is_full,
|
||||||
|
t_begin=datetime.utcnow(),
|
||||||
|
)
|
||||||
|
await crawl.save(conn)
|
||||||
|
return crawl
|
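
CrawlWorker above (like PGPool in db.py below) is created with `await CrawlWorker(...)`: `__await__` delegates to `__ainit__`, which runs the asynchronous part of initialization and returns the instance. A generic minimal sketch of this awaitable-initialization pattern (an illustration, not code from the commit):

# Sketch of the awaitable-initialization pattern: `await Thing(...)` runs the
# async part of construction before the instance is handed back.
import asyncio


class Thing:
    def __init__(self, name):
        self.name = name
        self.ready = False

    def __await__(self):
        return self.__ainit__().__await__()

    async def __ainit__(self):
        await asyncio.sleep(0)  # stands in for async setup (DB, HTTP, ...)
        self.ready = True
        return self


async def main():
    thing = await Thing('worker-0')  # same shape as: worker = await CrawlWorker(...)
    print(thing.name, thing.ready)


asyncio.run(main())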
162
src/atextcrawler/db.py
Normal file
@@ -0,0 +1,162 @@
"""
|
||||||
|
PostgreSQL connectivity.
|
||||||
|
|
||||||
|
PGPool can be used as context manager. It takes postgresql configuration
|
||||||
|
parameters and gives a connection pool.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from io import TextIOBase
|
||||||
|
from pathlib import Path
|
||||||
|
from traceback import format_exc
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from .utils.json import json_dumps, json_loads
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PGPool:
|
||||||
|
"""
|
||||||
|
Database connectivity: Provide a connection pool.
|
||||||
|
|
||||||
|
Can be used either as async context manager (giving a pool),
|
||||||
|
or as a class using async init and the shutdown method and
|
||||||
|
having the pool attribute.
|
||||||
|
|
||||||
|
After startup self.pool contains a PostgreSQL connection pool
|
||||||
|
(instance of :class:`asyncpg.pool.Pool`).
|
||||||
|
|
||||||
|
Startup also runs schema migrations (cf. directory `migrations`).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
postgresql_config: dict,
|
||||||
|
out: TextIOBase = None,
|
||||||
|
check: bool = True,
|
||||||
|
) -> None:
|
||||||
|
self.conf = postgresql_config
|
||||||
|
self.out = out or sys.stdout
|
||||||
|
self.check = check
|
||||||
|
self.pool = None
|
||||||
|
|
||||||
|
def __await__(self):
|
||||||
|
return self.__ainit__().__await__()
|
||||||
|
|
||||||
|
async def __ainit__(self):
|
||||||
|
await self.__aenter__()
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
"""
|
||||||
|
Return the connection pool after an optional check.
|
||||||
|
|
||||||
|
The check tests basic database access and runs missing migrations.
|
||||||
|
If the check fails, return None.
|
||||||
|
"""
|
||||||
|
pool_params = {
|
||||||
|
key: val
|
||||||
|
for key, val in self.conf.items()
|
||||||
|
if key
|
||||||
|
in (
|
||||||
|
'host',
|
||||||
|
'port',
|
||||||
|
'database',
|
||||||
|
'user',
|
||||||
|
'password',
|
||||||
|
'max_size',
|
||||||
|
'min_size',
|
||||||
|
)
|
||||||
|
}
|
||||||
|
pool_params['command_timeout'] = 30
|
||||||
|
self.pool = await asyncpg.create_pool(**pool_params, init=self._init)
|
||||||
|
if self.check:
|
||||||
|
async with self.pool.acquire() as conn:
|
||||||
|
if await self.check_or_migrate(conn):
|
||||||
|
return self.pool
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def _init(conn) -> None:
|
||||||
|
"""
|
||||||
|
Add JSON encoding and decoding to the given connection.
|
||||||
|
"""
|
||||||
|
await conn.set_type_codec(
|
||||||
|
'jsonb',
|
||||||
|
encoder=json_dumps,
|
||||||
|
decoder=json_loads,
|
||||||
|
schema='pg_catalog',
|
||||||
|
)
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc, tb) -> None:
|
||||||
|
"""
|
||||||
|
Close the connection pool.
|
||||||
|
"""
|
||||||
|
await self.shutdown()
|
||||||
|
|
||||||
|
async def shutdown(self):
|
||||||
|
"""
|
||||||
|
Close the pool.
|
||||||
|
"""
|
||||||
|
await self.pool.close()
|
||||||
|
|
||||||
|
async def check_or_migrate(self, conn: asyncpg.Connection) -> bool:
|
||||||
|
"""
|
||||||
|
Check database connectivity.
|
||||||
|
|
||||||
|
Return whether database connectivity is working.
|
||||||
|
"""
|
||||||
|
row = await conn.fetchrow('SELECT 1+1 AS result')
|
||||||
|
if not row or row.get('result') != 2:
|
||||||
|
msg = 'Database SELECT 1+1 not working; missing privileges?'
|
||||||
|
print(msg, file=self.out)
|
||||||
|
logger.critical(msg)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# determine current schema_version
|
||||||
|
try:
|
||||||
|
sql = "SELECT value::int FROM kvs WHERE key='schema_version'"
|
||||||
|
schema_version = await conn.fetchval(sql)
|
||||||
|
except:
|
||||||
|
schema_version = 0
|
||||||
|
|
||||||
|
# run missing migrations
|
||||||
|
migrations = get_migrations()
|
||||||
|
for number, text in sorted(migrations.items()):
|
||||||
|
if number > schema_version:
|
||||||
|
cmds = text.split('\n----\n')
|
||||||
|
for cmd in cmds:
|
||||||
|
if not cmd.strip():
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
await conn.execute(cmd)
|
||||||
|
except:
|
||||||
|
msg = (
|
||||||
|
f'Exception during migration {number} in '
|
||||||
|
f'statement\n{cmd}'
|
||||||
|
)
|
||||||
|
print(msg, file=self.out)
|
||||||
|
logger.critical(msg)
|
||||||
|
print(format_exc(), file=self.out)
|
||||||
|
logger.critical(format_exc())
|
||||||
|
return False
|
||||||
|
|
||||||
|
# return success
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def get_migrations() -> Dict[int, str]:
|
||||||
|
"""
|
||||||
|
Return migrations (number and text content of migration file).
|
||||||
|
"""
|
||||||
|
migrations_dir = Path(__file__).parent / 'migrations'
|
||||||
|
migrations = {}
|
||||||
|
for migration_file in migrations_dir.glob('*.sql'):
|
||||||
|
migration_number = int(migration_file.name[:-4])
|
||||||
|
with migration_file.open() as mig_file:
|
||||||
|
content = mig_file.read()
|
||||||
|
migrations[migration_number] = content
|
||||||
|
return migrations
|
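
A usage sketch for PGPool, illustrating both styles described in its docstring (the connection parameters below are placeholders; the check run on startup also applies any missing migrations):

# Usage sketch, not part of the commit.
import asyncio

from atextcrawler.db import PGPool

pg_conf = {
    'host': 'localhost',
    'port': 5432,
    'database': 'atextcrawler',
    'user': 'atextcrawler',
    'password': 'secret',
    'schema_name': 'public',
    'min_size': 2,
    'max_size': 5,
}


async def main():
    # style 1: async context manager giving the pool
    async with PGPool(pg_conf) as pool:
        print(await pool.fetchval('SELECT 1'))

    # style 2: async init plus explicit shutdown, pool on the instance
    pgpool = await PGPool(pg_conf)
    print(await pgpool.pool.fetchval('SELECT 1'))
    await pgpool.shutdown()


asyncio.run(main())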
297
src/atextcrawler/migrations/1.sql
Normal file
@@ -0,0 +1,297 @@
CREATE TABLE kvs (
    id bigserial PRIMARY KEY,
    t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
    key varchar(200) NOT NULL UNIQUE,
    value jsonb
)
----
COMMENT ON COLUMN kvs.t_update IS 'Time of last update or insert of the entry';
----
COMMENT ON COLUMN kvs.key IS 'Key';
----
COMMENT ON COLUMN kvs.value IS 'Value';
----
COMMENT ON TABLE kvs IS 'Simple key-value store';
----
INSERT INTO kvs (key, value) VALUES ('schema_version', '1');
----
CREATE TABLE site (
    id bigserial PRIMARY KEY,
    canonical_url varchar(200),
    base_url varchar(200) NOT NULL,
    base_urls varchar(200)[] NOT NULL,
    domains varchar(100)[],
    ips inet[] NULL,
    crawl_enabled bool NOT NULL DEFAULT false,
    crawl_active bool NOT NULL DEFAULT false,
    next_full_crawl timestamp,
    next_feed_crawl timestamp,
    last_update timestamp,
    last_pub timestamp,
    pub_dates jsonb NOT NULL DEFAULT '{}'::jsonb,
    langs char(2)[] NOT NULL DEFAULT ARRAY[]::varchar(2)[],
    alt_langs jsonb NOT NULL DEFAULT '{}'::jsonb,
    title varchar(200),
    description varchar(2000),
    keywords varchar(50)[] NOT NULL DEFAULT ARRAY[]::varchar(50)[],
    linkbacks jsonb NOT NULL DEFAULT '{}'::jsonb,
    meta_info jsonb NOT NULL DEFAULT '{}'::jsonb,
    boilerplate_texts jsonb NOT NULL DEFAULT '[]'::jsonb
)
----
CREATE INDEX site__base_url ON site (base_url)
----
CREATE INDEX site__base_urls ON site (base_urls)
----
CREATE INDEX site__domains ON site (domains)
----
CREATE INDEX site__ips ON site (ips)
----
CREATE INDEX site__next_full_crawl ON site (next_full_crawl)
----
CREATE INDEX site__next_feed_crawl ON site (next_feed_crawl)
----
CREATE INDEX site__langs ON site (langs)
----
CREATE INDEX site__title ON site (title)
----
CREATE INDEX site__description ON site (description)
----
CREATE INDEX site__keywords ON site (keywords)
----
COMMENT ON COLUMN site.base_url IS 'Preferred base URL (from column base_urls)'
----
COMMENT ON COLUMN site.base_urls IS 'Base URLs that have been found to return the same content'
----
COMMENT ON COLUMN site.domains IS 'Domains that have been found to return the same content'
----
COMMENT ON COLUMN site.ips IS 'IPv4 or IPv6 addresses of the hostnames in base_urls'
----
COMMENT ON COLUMN site.crawl_enabled IS 'Whether the site should be indexed'
----
COMMENT ON COLUMN site.crawl_active IS 'Whether a crawl is in progress'
----
COMMENT ON COLUMN site.next_full_crawl IS 'Crawl all resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.next_feed_crawl IS 'Crawl the feed resources of this site again after this instant of time; do not crawl if null'
----
COMMENT ON COLUMN site.last_update IS 'Time of last update of this site (in this database)'
----
COMMENT ON COLUMN site.last_pub IS 'Estimated time of last content publication on the site'
----
COMMENT ON COLUMN site.pub_dates IS 'Change history: map visit date to estimated publication date'
----
COMMENT ON COLUMN site.langs IS 'Languages of the site (ISO 639-1 codes)'
----
COMMENT ON COLUMN site.alt_langs IS 'Map links to alternative language versions of the site to ISO 639-1 language codes'
----
COMMENT ON COLUMN site.title IS 'Title as obtained from title tag or meta tags'
----
COMMENT ON COLUMN site.description IS 'Description as obtained from meta tags'
----
COMMENT ON COLUMN site.keywords IS 'Keywords as obtained from meta tags'
----
COMMENT ON COLUMN site.linkbacks IS 'Map URL to type of linkback (cf. https://en.wikipedia.org/wiki/Linkback)'
----
COMMENT ON COLUMN site.meta_info IS 'Values from meta tags and other meta information'
----
COMMENT ON COLUMN site.boilerplate_texts IS 'Boilerplate texts on the startpage and other sample pages'
----
COMMENT ON TABLE site IS 'Website'
----
CREATE TABLE site_queue (
    id bigserial PRIMARY KEY,
    src bigint NULL REFERENCES site(id) ON DELETE CASCADE,
    url varchar(200) NOT NULL,
    link_text varchar(100),
    t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_queue__url ON site_queue (url)
----
COMMENT ON COLUMN site_queue.src IS 'The id of the linking site; null in case of seeds or manual additions'
----
COMMENT ON COLUMN site_queue.url IS 'Base URL of site to be assessed, ending with a slash or a mandatory base path'
----
COMMENT ON COLUMN site_queue.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON COLUMN site_queue.t_create IS 'Creation time of this entry'
----
COMMENT ON TABLE site_queue IS 'Queued site URLs'
----
CREATE TABLE site_feed (
    id bigserial PRIMARY KEY,
    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    url varchar(200) NOT NULL,
    etag text,
    modified varchar(50),
    t_visit timestamp,
    t_content timestamp,
    version varchar(10),
    title varchar(200),
    description text,
    fail_count smallint NOT NULL DEFAULT 0
)
----
CREATE INDEX site_feed__site ON site_feed (site_id)
----
CREATE INDEX site_feed__t_content ON site_feed (t_content)
----
COMMENT ON COLUMN site_feed.site_id IS 'Id of the site on which this feed was found'
----
COMMENT ON COLUMN site_feed.url IS 'URL of the feed'
----
COMMENT ON COLUMN site_feed.etag IS 'Etag obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.modified IS 'Last-Modified HTTP header value obtained when requesting the feed'
----
COMMENT ON COLUMN site_feed.t_visit IS 'Time of last retrieval of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.t_content IS 'Time of last content update; null before first retrieval'
----
COMMENT ON COLUMN site_feed.version IS 'Version of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.title IS 'Title of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.description IS 'Description of the feed; null before first retrieval'
----
COMMENT ON COLUMN site_feed.fail_count IS 'Number of failed retrievals after last successful retrieval; zero before first retrieval'
----
CREATE TABLE site_link (
    id bigserial PRIMARY KEY,
    src bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    dst bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
    link_text varchar(100)
)
----
ALTER TABLE site_link ADD CONSTRAINT site_link_edge UNIQUE (src, dst)
----
CREATE INDEX site_link__src ON site_link (src)
----
CREATE INDEX site_link__dst ON site_link (dst)
----
COMMENT ON COLUMN site_link.src IS 'Source site'
----
COMMENT ON COLUMN site_link.dst IS 'Destination site'
----
COMMENT ON COLUMN site_link.t_create IS 'Time of creation of this entry'
----
COMMENT ON COLUMN site_link.link_text IS 'Text under the anchor tag on the source site'
----
COMMENT ON TABLE site_link IS 'Cross-site link'
----
CREATE TABLE resource (
    id bigserial PRIMARY KEY,
    simhash bigint,
    content_type varchar(50),
    last_change timestamp,
    text_len int,
    lang char(2),
    title varchar(200),
    summary varchar(2000)
)
----
COMMENT ON COLUMN resource.simhash IS 'Simhash of the text content of the resource'
----
COMMENT ON COLUMN resource.content_type IS 'Content type extracted from Content-Type HTTP header'
----
COMMENT ON COLUMN resource.last_change IS 'Estimated time of the last update of this resource'
----
COMMENT ON COLUMN resource.text_len IS 'Length of the extracted text in characters'
----
COMMENT ON COLUMN resource.lang IS 'Language ISO 639-1 code'
----
COMMENT ON COLUMN resource.title IS 'Title of the resource (used for feed resources)'
----
COMMENT ON COLUMN resource.summary IS 'Content summary of the resource (used for feed resources)'
----
COMMENT ON TABLE resource IS 'Text resource (may be reachable by more than one path of a site)'
----
CREATE TABLE site_path (
    id bigserial PRIMARY KEY,
    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    path varchar(400) NOT NULL,
    last_visit timestamp,
    filtered bool NOT NULL DEFAULT false,
    ok_count smallint NOT NULL DEFAULT 0,
    canonical bool,
    resource_id bigint REFERENCES resource(id) ON DELETE CASCADE
)
----
ALTER TABLE site_path ADD CONSTRAINT site_path__unique UNIQUE (site_id, path)
----
CREATE INDEX site_path__site_path ON site_path (site_id, path)
----
CREATE INDEX site_path__resource ON site_path (resource_id)
----
COMMENT ON COLUMN site_path.site_id IS 'Site id'
----
COMMENT ON COLUMN site_path.path IS 'Path'
----
COMMENT ON COLUMN site_path.last_visit IS 'Time of last retrieval of the resource; null before first retrieval'
----
COMMENT ON COLUMN site_path.ok_count IS 'Increased by 1 for every successful retrieval of the resource and decreased by 1 for every failed one'
----
COMMENT ON COLUMN site_path.canonical IS 'Whether the path is the canonical one for the resource; null before first retrieval'
----
COMMENT ON COLUMN site_path.resource_id IS 'Resource id; null before first retrieval'
----
COMMENT ON TABLE site_path IS 'Paths of a site pointing to text resources'
----
CREATE TABLE crawl (
    id bigserial PRIMARY KEY,
    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
    is_full bool NOT NULL DEFAULT false,
    t_begin timestamp,
    t_end timestamp,
    n_resources int NOT NULL DEFAULT 0,
    n_resources_new int NOT NULL DEFAULT 0
)
----
CREATE INDEX crawl__site ON crawl (site_id)
----
CREATE INDEX crawl__t_begin ON crawl (t_begin)
----
COMMENT ON COLUMN crawl.site_id IS 'Site that is being crawled'
----
COMMENT ON COLUMN crawl.is_full IS 'Whether the crawl is a full crawl; if not it is a feed crawl'
----
COMMENT ON COLUMN crawl.t_begin IS 'Begin time of the crawl'
----
COMMENT ON COLUMN crawl.t_end IS 'End time of the crawl; if t_end is null resuming a crawl will fetch all resources with last_visit before t_begin'
----
COMMENT ON COLUMN crawl.n_resources IS 'Number of resources that were fetched during the crawl'
----
COMMENT ON COLUMN crawl.n_resources_new IS 'Number of new resources found during the crawl'
----
COMMENT ON TABLE crawl IS 'Crawl of resources on a site'
----
CREATE TYPE site_annotation_type AS ENUM ('whitelist', 'blacklist', 'suggestion', 'review', 'audience', 'location', 'themes', 'timescale')
----
COMMENT ON TYPE site_annotation_type IS 'Type of site annotation'
----
CREATE TABLE site_annotation (
    id bigserial PRIMARY KEY,
    site_id bigint REFERENCES site(id) ON DELETE SET NULL,
    base_url varchar(200) NOT NULL,
    ann_type site_annotation_type NOT NULL,
    ann_content JSONB,
    t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc')
)
----
CREATE INDEX site_annotation__site ON site_annotation (site_id)
----
CREATE INDEX site_annotation__base_url ON site_annotation (base_url)
----
COMMENT ON COLUMN site_annotation.site_id IS 'Site that is being annotated'
----
COMMENT ON COLUMN site_annotation.base_url IS 'Base URL of the site being annotated'
----
COMMENT ON COLUMN site_annotation.ann_type IS 'Annotation type'
----
COMMENT ON COLUMN site_annotation.ann_content IS 'Annotation content'
----
COMMENT ON COLUMN site_annotation.t_update IS 'Time of last update'
----
COMMENT ON TABLE site_annotation IS 'Manual annotations on a site'
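
An illustrative query against the schema above (a sketch, not part of the commit; the DSN and the selection criteria are assumptions chosen to match the column comments, e.g. fetching sites that are due for a full crawl and not currently locked):

# Sketch: read due sites directly with asyncpg.
import asyncio

import asyncpg


async def due_sites(dsn: str) -> list:
    conn = await asyncpg.connect(dsn)
    try:
        sql = (
            "SELECT id, base_url FROM site"
            " WHERE crawl_enabled AND NOT crawl_active"
            " AND next_full_crawl < now() at time zone 'utc'"
            " ORDER BY next_full_crawl"
            " LIMIT 10"
        )
        return await conn.fetch(sql)
    finally:
        await conn.close()


# rows = asyncio.run(due_sites('postgresql://user:password@localhost/atextcrawler'))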
610
src/atextcrawler/models.py
Normal file
@@ -0,0 +1,610 @@
"""
|
||||||
|
Data Models.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import InitVar, asdict, dataclass, field, fields
|
||||||
|
from datetime import date, datetime
|
||||||
|
from itertools import chain
|
||||||
|
from typing import Any, ClassVar, Optional
|
||||||
|
|
||||||
|
import tldextract
|
||||||
|
from asyncpg import Connection
|
||||||
|
|
||||||
|
from .search import delete_resource
|
||||||
|
from .utils.durl import Durl, get_url_variants
|
||||||
|
from .utils.link import extract_domain
|
||||||
|
from .utils.similarity import get_simhash, simhash_to_bigint
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ModelBase:
|
||||||
|
"""
|
||||||
|
Abstract base class for models.
|
||||||
|
|
||||||
|
Execute SQL to load, save, delete instances using asyncpg.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar
|
||||||
|
id_: Optional[int] = 0
|
||||||
|
|
||||||
|
async def load(self, conn: Connection, id_: int) -> Optional[Any]:
|
||||||
|
"""
|
||||||
|
If loading fails, return None.
|
||||||
|
"""
|
||||||
|
sql = f"SELECT * FROM {self.table} WHERE id=$1"
|
||||||
|
row = await conn.fetchrow(sql, id_)
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
return await self.load_from_row(row)
|
||||||
|
|
||||||
|
async def load_from_row(self, row):
|
||||||
|
"""
|
||||||
|
If row is None, return None.
|
||||||
|
"""
|
||||||
|
if not row:
|
||||||
|
return None
|
||||||
|
data = dict(row)
|
||||||
|
self.id_ = data.pop('id')
|
||||||
|
self.__init__(**data)
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def save(self, conn: Connection) -> None:
|
||||||
|
"""
|
||||||
|
Save the instance (update if self.id_ is set, else insert).
|
||||||
|
"""
|
||||||
|
data = asdict(self)
|
||||||
|
# logger.debug(f'Save {self}: id_={self.id_}')
|
||||||
|
if self.id_: # update
|
||||||
|
cols = ', '.join(data.keys())
|
||||||
|
upds = ', '.join(
|
||||||
|
[f'{col}=${i + 1}' for i, col in enumerate(data.keys())]
|
||||||
|
)
|
||||||
|
val_id = f'${len(data) + 1}'
|
||||||
|
sql = f"UPDATE {self.table} SET {upds} WHERE id={val_id}"
|
||||||
|
await conn.execute(sql, *data.values(), self.id_)
|
||||||
|
else: # insert
|
||||||
|
cols = ', '.join(data.keys())
|
||||||
|
vals = ', '.join([f'${i + 1}' for i in range(len(data))])
|
||||||
|
sql = (
|
||||||
|
f"INSERT INTO {self.table} ({cols}) VALUES ({vals})"
|
||||||
|
f" RETURNING id"
|
||||||
|
)
|
||||||
|
self.id_ = await conn.fetchval(sql, *data.values())
|
||||||
|
|
||||||
|
def asdict(self):
|
||||||
|
"""
|
||||||
|
Return instance data as dictionary.
|
||||||
|
"""
|
||||||
|
return asdict(self)
|
||||||
|
|
||||||
|
async def delete(self, conn: Connection) -> None:
|
||||||
|
"""
|
||||||
|
Delete the object if it has an id_.
|
||||||
|
"""
|
||||||
|
if self.id_:
|
||||||
|
sql = f"DELETE FROM {self.table} WHERE id=$1"
|
||||||
|
await conn.execute(sql, self.id_)
|
||||||
|
|
||||||
|
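
ModelBase drives its SQL from `asdict()`: every dataclass field of a subclass is mapped to a column of `self.table`, and the `id` column is handled by ModelBase itself via `id_`. A hypothetical minimal model (the `note` table and its columns are made up for illustration and are not part of the commit) would look like this:

# Hypothetical subclass sketch; assumes a table `note(id bigserial, title, body)`
# whose columns mirror the dataclass fields exactly.
from dataclasses import dataclass
from typing import ClassVar, Optional


@dataclass
class Note(ModelBase):
    table: ClassVar = 'note'
    title: Optional[str] = None
    body: Optional[str] = None

# await Note(title='hello', body='world').save(conn)  # INSERT ... RETURNING id
# note = await Note().load(conn, 1)                   # SELECT * FROM note WHERE id=$1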
|
||||||
|
class ResourceError:
|
||||||
|
"""
|
||||||
|
Error encountered while trying to fetch a resource.
|
||||||
|
|
||||||
|
ResourceError is used for cases when fetching a resource fails.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, msg, status=None, headers=None):
|
||||||
|
self.msg = msg
|
||||||
|
self.status = status
|
||||||
|
self.headers = headers
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f'ResourceError: {self.msg}'
|
||||||
|
|
||||||
|
|
||||||
|
class ResourceRedirect:
|
||||||
|
"""
|
||||||
|
A resource containing a redirect.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, urls):
|
||||||
|
self.urls = urls
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TextResource(ModelBase):
|
||||||
|
"""
|
||||||
|
TextResource (without path).
|
||||||
|
|
||||||
|
TextResource models web resources with relevant text content.
|
||||||
|
They are instantiated in modules page, document, ...; their metadata
|
||||||
|
are stored in table `resource` and the text content is stored with the
|
||||||
|
search engine.
|
||||||
|
|
||||||
|
Do not confuse with SitePath: Several SitePath instances
|
||||||
|
may point to a TextResource. The TextResource holds the actual content.
|
||||||
|
|
||||||
|
If we are not dealing with the startpage of a new site,
|
||||||
|
the init_fields dict usually will contain the site to which
|
||||||
|
the resource belongs.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'resource'
|
||||||
|
init_fields: InitVar[dict] = None # additional fields after fetching
|
||||||
|
search_fields: InitVar[dict] = None # additional fields for indexing
|
||||||
|
|
||||||
|
# database fields
|
||||||
|
simhash: Optional[int] = None
|
||||||
|
content_type: Optional[str] = None
|
||||||
|
last_change: Optional[datetime] = None
|
||||||
|
text_len: int = 0
|
||||||
|
lang: Optional[str] = None
|
||||||
|
title: Optional[str] = None
|
||||||
|
summary: Optional[str] = None
|
||||||
|
|
||||||
|
def __post_init__(self, init_fields, search_fields):
|
||||||
|
if init_fields is None:
|
||||||
|
init_fields = {}
|
||||||
|
self.init_fields = init_fields
|
||||||
|
if search_fields is None:
|
||||||
|
search_fields = {}
|
||||||
|
self.search_fields = search_fields
|
||||||
|
self.site = self.init_fields.get('site')
|
||||||
|
self.site_id = self.site.id_ if self.site else None
|
||||||
|
self._update_simhash()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return (
|
||||||
|
f'TextResource(id={self.id_},'
|
||||||
|
f' site_id={self.site_id},'
|
||||||
|
f' type={self.content_type})'
|
||||||
|
)
|
||||||
|
|
||||||
|
def _update_simhash(self):
|
||||||
|
"""
|
||||||
|
Update the simhash of the resource from its text content.
|
||||||
|
"""
|
||||||
|
if self.simhash is None:
|
||||||
|
text = self.search_fields.get('text', '')
|
||||||
|
self.simhash = simhash_to_bigint(get_simhash(text))
|
||||||
|
|
||||||
|
async def save(self, conn: Connection):
|
||||||
|
"""
|
||||||
|
Save the instance, extending the parent's method.
|
||||||
|
"""
|
||||||
|
self.content_type = (
|
||||||
|
self.content_type[:50] if self.content_type else None
|
||||||
|
)
|
||||||
|
self.title = self.title[:200] if self.title else None
|
||||||
|
self.summary = self.summary[:400] if self.summary else None
|
||||||
|
self._update_simhash()
|
||||||
|
if self.last_change is None:
|
||||||
|
self.last_change = datetime.utcnow()
|
||||||
|
await super().save(conn)
|
||||||
|
|
||||||
|
async def update_from_resource(self, upd: 'TextResource'):
|
||||||
|
"""
|
||||||
|
Update self with values from another resource.
|
||||||
|
"""
|
||||||
|
names = [field.name for field in fields(self)]
|
||||||
|
for name in names:
|
||||||
|
cur_val = getattr(self, name)
|
||||||
|
upd_val = getattr(upd, name)
|
||||||
|
if not cur_val and upd_val is not None:
|
||||||
|
setattr(self, name, upd_val)
|
||||||
|
init_names = [
|
||||||
|
'headers',
|
||||||
|
'redirects',
|
||||||
|
'links_int',
|
||||||
|
'links_ext',
|
||||||
|
'shortlinks',
|
||||||
|
'canonical',
|
||||||
|
#'head',
|
||||||
|
]
|
||||||
|
self.init_fields = upd.init_fields
|
||||||
|
self.search_fields = upd.search_fields
|
||||||
|
# for init_name in init_names:
|
||||||
|
# cur_val = self.init_fields.get(init_name)
|
||||||
|
# upd_val = upd.init_fields.get(init_name)
|
||||||
|
# if not cur_val and upd_val is not None:
|
||||||
|
# self.init_fields[init_name] = upd_val
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MetaResource(ModelBase):
|
||||||
|
"""
|
||||||
|
Parent class for Feed, Sitemap, SitemapIndex.
|
||||||
|
|
||||||
|
MetaResource is a parent class for Feed, Sitemap, SitemapIndex.
|
||||||
|
Their instances are not stored. Note: class Feed contains feed meta data
|
||||||
|
and is stored in the database.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SitemapIndex(MetaResource):
|
||||||
|
"""
|
||||||
|
A SitemapIndex meta resource.
|
||||||
|
|
||||||
|
Just a list of the siteap URLs, nothing more.
|
||||||
|
"""
|
||||||
|
|
||||||
|
sitemaps: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Sitemap(MetaResource):
|
||||||
|
"""
|
||||||
|
A Sitemap meta resource.
|
||||||
|
|
||||||
|
Just a list of the resulting links, nothing more.
|
||||||
|
"""
|
||||||
|
|
||||||
|
urls: list = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Feed(MetaResource):
|
||||||
|
"""
|
||||||
|
A site's feed (RSS, Atom , ...).
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'site_feed'
|
||||||
|
entries: InitVar[list] = None
|
||||||
|
site_id: Optional[int] = None
|
||||||
|
url: Optional[str] = None
|
||||||
|
etag: Optional[str] = None
|
||||||
|
modified: Optional[str] = None
|
||||||
|
t_visit: Optional[datetime] = None
|
||||||
|
t_content: Optional[datetime] = None
|
||||||
|
version: Optional[str] = None
|
||||||
|
title: Optional[str] = None
|
||||||
|
description: Optional[str] = None
|
||||||
|
fail_count: int = 0
|
||||||
|
|
||||||
|
def __post_init__(self, entries):
|
||||||
|
self.entries = entries
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return f'Feed(id={self.id_}, site_id={self.site_id}, url={self.url})'
|
||||||
|
|
||||||
|
async def save(self, conn: Connection):
|
||||||
|
"""
|
||||||
|
Save, trying to merge with existing entry matching on site_id and url.
|
||||||
|
"""
|
||||||
|
if not self.site_id or not self.url:
|
||||||
|
msg = f'Saving feed failed: missing site_id of url'
|
||||||
|
logger.error(msg)
|
||||||
|
return
|
||||||
|
sql = "SELECT id FROM site_feed WHERE site_id=$1 AND url=$2"
|
||||||
|
self.id_ = await conn.fetchval(sql, self.site_id, self.url)
|
||||||
|
await super().save(conn)
|
||||||
|
|
||||||
|
def debug(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the instance data asa string for debug print output.
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
f'Feed:\n'
|
||||||
|
f'- id: {self.id_}\n'
|
||||||
|
f'- site_id: {self.site_id}\n'
|
||||||
|
f'- url: {self.url}\n'
|
||||||
|
f'- etag: {self.etag}\n'
|
||||||
|
f'- modified: {self.modified}\n'
|
||||||
|
f'- t_visit: {self.t_visit}\n'
|
||||||
|
f'- t_content: {self.t_content}\n'
|
||||||
|
f'- version: {self.version}\n'
|
||||||
|
f'- title: {self.title}\n'
|
||||||
|
f'- description: {self.description}\n'
|
||||||
|
f'- fail_count: {self.fail_count}\n'
|
||||||
|
f'- entries: {self.entries}'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Site(ModelBase):
|
||||||
|
"""
|
||||||
|
Website.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'site'
|
||||||
|
base_durl: InitVar[Durl] = None
|
||||||
|
feeds: InitVar[dict] = None
|
||||||
|
links_ext: InitVar[dict] = None
|
||||||
|
links_int: InitVar[dict] = None
|
||||||
|
startpage_text: InitVar[str] = None
|
||||||
|
|
||||||
|
canonical_url: Optional[str] = None
|
||||||
|
base_url: Optional[str] = None
|
||||||
|
base_urls: list[str] = field(default_factory=list)
|
||||||
|
domains: list[str] = field(default_factory=list)
|
||||||
|
ips: Optional[list[str]] = None
|
||||||
|
crawl_enabled: bool = False
|
||||||
|
crawl_active: bool = False
|
||||||
|
next_full_crawl: Optional[datetime] = None
|
||||||
|
next_feed_crawl: Optional[datetime] = None
|
||||||
|
last_update: Optional[datetime] = None
|
||||||
|
last_pub: Optional[datetime] = None
|
||||||
|
pub_dates: Optional[dict[str, str]] = None
|
||||||
|
langs: list[str] = field(default_factory=list)
|
||||||
|
alt_langs: dict[str, str] = field(default_factory=dict)
|
||||||
|
title: Optional[str] = None
|
||||||
|
description: Optional[str] = None
|
||||||
|
keywords: list[str] = field(default_factory=list)
|
||||||
|
linkbacks: dict[str, str] = field(default_factory=dict)
|
||||||
|
meta_info: dict = field(default_factory=dict)
|
||||||
|
boilerplate_texts: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
def __post_init__(
|
||||||
|
self,
|
||||||
|
base_durl: Durl,
|
||||||
|
feeds=None,
|
||||||
|
links_ext=None,
|
||||||
|
links_int=None,
|
||||||
|
startpage_text=None,
|
||||||
|
):
|
||||||
|
self.feeds = feeds
|
||||||
|
self.links_ext = links_ext
|
||||||
|
self.links_int = links_int
|
||||||
|
self.startpage_text = startpage_text
|
||||||
|
self.keywords = self.keywords[:20]
|
||||||
|
if not self.last_update:
|
||||||
|
self.last_update = datetime.utcnow()
|
||||||
|
pub_date: Optional[str]
|
||||||
|
if self.last_pub:
|
||||||
|
pub_date = date.isoformat(self.last_pub.date())
|
||||||
|
self.pub_dates = {date.isoformat(self.last_update): pub_date}
|
||||||
|
else:
|
||||||
|
pub_date = None
|
||||||
|
self.pub_dates = {}
|
||||||
|
if base_durl:
|
||||||
|
self.base_urls = [base_durl.url()[:200]]
|
||||||
|
self.domains = [extract_domain(base_durl.hostname)[:100]]
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return (
|
||||||
|
f'Site(id={self.id_}, url={self.base_url},'
|
||||||
|
f' crawl_enabled={self.crawl_enabled})'
|
||||||
|
)
|
||||||
|
|
||||||
|
async def update_base_url(self) -> None:
|
||||||
|
"""
|
||||||
|
Update the base_url, choosing the most relevant URL.
|
||||||
|
|
||||||
|
If canonical_url is not None, use this.
|
||||||
|
Otherwise set self.base_url to the shortest from self.base_urls,
|
||||||
|
but requiring an https URL if there is at least one.
|
||||||
|
"""
|
||||||
|
if self.canonical_url and self.canonical_url not in self.base_urls:
|
||||||
|
if canonical_durl := await Durl(self.canonical_url):
|
||||||
|
self.base_urls.append(self.canonical_url)
|
||||||
|
domain = extract_domain(canonical_durl.hostname)
|
||||||
|
if domain not in self.domains:
|
||||||
|
self.domains.append(domain)
|
||||||
|
if self.canonical_url:
|
||||||
|
self.base_url = self.canonical_url
|
||||||
|
return
|
||||||
|
if not self.base_url:
|
||||||
|
url_candidates = self.base_urls
|
||||||
|
if https_urls := [
|
||||||
|
url for url in self.base_urls if url.startswith('https://')
|
||||||
|
]:
|
||||||
|
url_candidates = https_urls
|
||||||
|
self.base_url = min(url_candidates, key=len)
|
||||||
|
|
||||||
|
async def save( # type: ignore
|
||||||
|
self, conn, merge=True
|
||||||
|
) -> tuple[Optional[int], bool]:
|
||||||
|
"""
|
||||||
|
Store the site, optionally trying to merge it with an existing site.
|
||||||
|
|
||||||
|
Return the id of the saved instance and whether a new instance
|
||||||
|
was created.
|
||||||
|
|
||||||
|
If self.id_ is not 0, replace the data of the existing site with
|
||||||
|
this id. Else if not merge, store as new row, and if merge,
|
||||||
|
try to merge with an existing matching site.
|
||||||
|
"""
|
||||||
|
await self.update_base_url()
|
||||||
|
if not merge:
|
||||||
|
created = not bool(self.id_)
|
||||||
|
await super().save(conn)
|
||||||
|
return self.id_, created
|
||||||
|
if self.id_:
|
||||||
|
sql = "SELECT base_urls, pub_dates FROM site WHERE id=$1"
|
||||||
|
row = await conn.fetchrow(sql, self.id_)
|
||||||
|
self.base_urls = list(
|
||||||
|
set(row['base_urls']).union(set(self.base_urls))
|
||||||
|
)
|
||||||
|
if previous_pub_dates := row['pub_dates']:
|
||||||
|
if not self.pub_dates:
|
||||||
|
self.pub_dates = {}
|
||||||
|
self.pub_dates.update(previous_pub_dates)
|
||||||
|
await super().save(conn)
|
||||||
|
return self.id_, False
|
||||||
|
same_site_id = await search_same_site(self, conn)
|
||||||
|
if same_site_id:
|
||||||
|
same_site = await Site().load(conn, same_site_id)
|
||||||
|
if same_site_id and same_site:
|
||||||
|
same_site.base_urls = set(same_site.base_urls).union(
|
||||||
|
set(self.base_urls)
|
||||||
|
)
|
||||||
|
same_site.domains = set(same_site.domains).union(set(self.domains))
|
||||||
|
if self.canonical_url and not same_site.canonical_url:
|
||||||
|
same_site.canonical_url = self.canonical_url
|
||||||
|
await same_site.save(conn, merge=False) # call ourselves
|
||||||
|
self.id_ = same_site.id_
|
||||||
|
return self.id_, False
|
||||||
|
else:
|
||||||
|
await super().save(conn)
|
||||||
|
return self.id_, True
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SitePath(ModelBase):
|
||||||
|
"""
|
||||||
|
Path of a website. May point to a Resource.
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'site_path'
|
||||||
|
site: InitVar[str] = None
|
||||||
|
|
||||||
|
site_id: Optional[int] = None
|
||||||
|
path: Optional[str] = None
|
||||||
|
filtered: bool = False
|
||||||
|
last_visit: Optional[datetime] = None
|
||||||
|
ok_count: int = 0
|
||||||
|
canonical: Optional[bool] = None
|
||||||
|
resource_id: Optional[int] = None
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return (
|
||||||
|
f'SitePath(id={self.id_}, site_id={self.site_id},'
|
||||||
|
f' path={self.path})'
|
||||||
|
)
|
||||||
|
|
||||||
|
async def save(self, conn: Connection):
|
||||||
|
"""
|
||||||
|
Save the instance, extending the parent's method.
|
||||||
|
"""
|
||||||
|
self.path = self.path[:400] if self.path else ''
|
||||||
|
await super().save(conn)
|
||||||
|
|
||||||
|
async def unlink_resource(self, conn, engine, index_base_name):
|
||||||
|
"""
|
||||||
|
Unlink the resource and also delete it, if it has no more links.
|
||||||
|
"""
|
||||||
|
if self.id_:
|
||||||
|
if self.resource_id:
|
||||||
|
sql = "SELECT COUNT(*) FROM site_path WHERE resource_id=$1"
|
||||||
|
ref_count = await conn.fetchval(sql, self.resource_id)
|
||||||
|
if ref_count == 0:
|
||||||
|
sql = (
|
||||||
|
"DELETE FROM resource WHERE id=$1"
|
||||||
|
" RETURNING (true, lang)"
|
||||||
|
)
|
||||||
|
found = await conn.fetchval(sql, self.resource_id)
|
||||||
|
if found:
|
||||||
|
await delete_resource(
|
||||||
|
engine, found[1], self.resource_id
|
||||||
|
)
|
||||||
|
self.resource_id = None
|
||||||
|
|
||||||
|
def url(self, site):
|
||||||
|
"""
|
||||||
|
Return the full URL (combine the site's base_url with our path).
|
||||||
|
"""
|
||||||
|
return site.base_url + self.path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Crawl(ModelBase):
|
||||||
|
"""
|
||||||
|
The crawl process of a website (begin, end, statistics, ...).
|
||||||
|
"""
|
||||||
|
|
||||||
|
table: ClassVar = 'crawl'
|
||||||
|
site_id: Optional[int] = None
|
||||||
|
is_full: bool = False
|
||||||
|
t_begin: datetime = field(default_factory=datetime.utcnow)
|
||||||
|
t_end: Optional[datetime] = None
|
||||||
|
n_resources: int = 0
|
||||||
|
n_resources_new: int = 0
|
||||||
|
|
||||||
|
async def finish(self, conn, set_t_end):
|
||||||
|
"""
|
||||||
|
Save the crawl. Set t_end only if indicated.
|
||||||
|
"""
|
||||||
|
if set_t_end:
|
||||||
|
self.t_end = datetime.utcnow()
|
||||||
|
await self.save(conn)
|
||||||
|
|
||||||
|
|
||||||
|
async def search_same_site(
|
||||||
|
site: Site,
|
||||||
|
conn: Connection,
|
||||||
|
) -> Optional[int]:
|
||||||
|
"""
|
||||||
|
Try to find a matching site for the given *site* and return its id.
|
||||||
|
|
||||||
|
TODO: if the path is non-trivial, require it also for the matching site
|
||||||
|
|
||||||
|
Two sites match when they return the same content for identical paths.
|
||||||
|
The base_url (scheme and/or netloc) may differ.
|
||||||
|
We do not have the content for all paths of both websites, so we need
|
||||||
|
to estimate: We only take into account meta information from the
|
||||||
|
start pages of both sites, in particular the title, description
|
||||||
|
and information obtained from the base_urls:
|
||||||
|
|
||||||
|
We use a combination of these conditions:
|
||||||
|
|
||||||
|
1. one of the sites has a canonical URL which matches the
|
||||||
|
URL of the other site
|
||||||
|
2. the content fields (title, description) have sufficient information
|
||||||
|
3. the content fields match exactly
|
||||||
|
4. the domain matches
|
||||||
|
5. the domain matches, except for the TLD
|
||||||
|
6. the base_urls differ in their schemes (http vs. https)
|
||||||
|
7. the hostnames in the base_urls are identical
|
||||||
|
8. the hostnames in the base_urls differ by a prepended 'www.'
|
||||||
|
9. the IPs have at least one common address
|
||||||
|
|
||||||
|
The algorithm is this (first answer is final, yes means match):
|
||||||
|
|
||||||
|
* if (1) : yes
|
||||||
|
* if (2), (3), (4) : yes
|
||||||
|
* if (2), (3), (5), (9) : yes
|
||||||
|
* if (6), ((7) or (8)) : yes
|
||||||
|
* no
|
||||||
|
"""
|
||||||
|
# rule (1)
|
||||||
|
if site.canonical_url:
|
||||||
|
sql = "SELECT id FROM site WHERE $1=ANY(base_urls) LIMIT 1"
|
||||||
|
id_ = await conn.fetchval(sql, site.canonical_url)
|
||||||
|
if id_:
|
||||||
|
return id_
|
||||||
|
else:
|
||||||
|
sql = "SELECT id FROM site WHERE canonical_url=ANY($1) LIMIT 1"
|
||||||
|
id_ = await conn.fetchval(sql, site.base_urls)
|
||||||
|
if id_:
|
||||||
|
return id_
|
||||||
|
|
||||||
|
# rule (6), ((7) or (8))
|
||||||
|
url_variants = set(
|
||||||
|
chain.from_iterable(
|
||||||
|
get_url_variants(base_url) for base_url in site.base_urls
|
||||||
|
)
|
||||||
|
)
|
||||||
|
sql = f"SELECT id FROM site WHERE base_urls && $1 LIMIT 1"
|
||||||
|
if id_ := await conn.fetchval(sql, url_variants):
|
||||||
|
return id_
|
||||||
|
|
||||||
|
# condition (2)
|
||||||
|
if len(site.title or '') > 15 or len(site.description or '') > 15:
|
||||||
|
sql = (
|
||||||
|
f"SELECT * FROM site WHERE"
|
||||||
|
f" COALESCE(title, '')=$1 AND COALESCE(description, '')=$2"
|
||||||
|
)
|
||||||
|
rows = await conn.fetch(sql, site.title or '', site.description or '')
|
||||||
|
# condition (3)
|
||||||
|
if rows:
|
||||||
|
# condition (4)
|
||||||
|
for row in rows:
|
||||||
|
domains = set(row.get('domains', []))
|
||||||
|
if domains & set(site.domains):
|
||||||
|
return row['id']
|
||||||
|
# condition (9)
|
||||||
|
for row in rows:
|
||||||
|
ips = set(row.get('ips', []))
|
||||||
|
if site.ips and ips & set(site.ips):
|
||||||
|
# condition (5)
|
||||||
|
domains_ = row.get('domains', [])
|
||||||
|
d1 = set([tldextract.extract(d).domain for d in domains_])
|
||||||
|
domains_ = site.domains or []
|
||||||
|
d2 = set([tldextract.extract(d).domain for d in domains_])
|
||||||
|
if d1 & d2:
|
||||||
|
return row['id']
|
||||||
|
|
||||||
|
return None
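# A minimal, illustrative sketch of rules (6)-(8) above (not part of the
# crawler): two sites count as the same when their base URLs differ only in
# scheme and/or a leading 'www.'. It reuses get_url_variants() exactly as
# search_same_site() does, assuming it yields those scheme/'www.' variants;
# the URLs are made up.
def _example_scheme_www_match() -> bool:
    existing_base_urls = {'https://www.example.org'}
    new_base_url = 'http://example.org'
    variants = set(get_url_variants(new_base_url))
    # truthy if the assumption about get_url_variants() holds
    return bool(existing_base_urls & variants)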
|
0
src/atextcrawler/plugin_defaults/__init__.py
Normal file
22
src/atextcrawler/plugin_defaults/filter_resource_path.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
"""
|
||||||
|
Filter paths found in a resource.
|
||||||
|
|
||||||
|
This plugin implements :func:`rp_filter`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
def rp_filter(site, durl) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Adjust or filter found paths (may depend on site).
|
||||||
|
|
||||||
|
To filter out a path (i.e., not add it to table `site_path`)
|
||||||
|
return None.
|
||||||
|
"""
|
||||||
|
path = durl.pwa()
|
||||||
|
# skip fetching images (linked from a tags; img tags are skipped anyway)
|
||||||
|
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
|
||||||
|
return None
|
||||||
|
path = path.removesuffix('?amp=1')
|
||||||
|
return path
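# Illustrative sketch of a stricter variant of this plugin (not one of the
# shipped defaults): it drops a few more media suffixes. Only the
# (site, durl) interface shown above is assumed; the suffix list is made up.
def rp_filter_strict(site, durl) -> Optional[str]:
    """
    Example variant of :func:`rp_filter`; returning None drops the path.
    """
    path = durl.pwa()
    if path.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.mp4')):
        return None
    path = path.removesuffix('?amp=1')
    return path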
|
47
src/atextcrawler/plugin_defaults/filter_site.py
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
"""
|
||||||
|
Relevance estimation of sites.
|
||||||
|
|
||||||
|
This plugin implements :func:`site_filter`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from atextcrawler.models import Site
|
||||||
|
|
||||||
|
MIN_RELEVANCE_SCORE = 5
|
||||||
|
|
||||||
|
|
||||||
|
async def site_filter(site: Site) -> bool:
|
||||||
|
"""
|
||||||
|
Assess relevance of the site (using language-dependent criteria).
|
||||||
|
|
||||||
|
If the site shall be crawled, return True, else False.
|
||||||
|
"""
|
||||||
|
# limit to sites in English or German language
|
||||||
|
if not set(['de', 'en']) & set(site.langs):
|
||||||
|
return False
|
||||||
|
score = 0.0
|
||||||
|
for crit_name, weight, langs, crit_re in re_criteria:
|
||||||
|
if '*' in langs or set(langs) & set(site.langs):
|
||||||
|
findings = crit_re.findall(site.startpage_text)
|
||||||
|
if findings:
|
||||||
|
score += weight * len(findings)
|
||||||
|
if site.title and crit_re.search(site.title):
|
||||||
|
score += 4 * weight
|
||||||
|
if site.description and crit_re.search(site.description):
|
||||||
|
score += 4 * weight
|
||||||
|
|
||||||
|
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
|
||||||
|
|
||||||
|
return score >= MIN_RELEVANCE_SCORE
|
||||||
|
|
||||||
|
|
||||||
|
re_criteria = {
|
||||||
|
(
|
||||||
|
'anarch',
|
||||||
|
1.0,
|
||||||
|
('*',),
|
||||||
|
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
|
||||||
|
),
|
||||||
|
('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
|
||||||
|
}
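# Illustrative only: an additional criterion would be one more
# (name, weight, languages, compiled_regex) tuple in the same shape as the
# entries of re_criteria above; the keyword and weight here are made up and
# this set is not used anywhere.
re_criteria_example = {
    ('syndikal', 0.5, ('de', 'en'), re.compile('(syndi[ck]alis)', re.I)),
}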
|
24
src/atextcrawler/plugin_defaults/filter_site_path.py
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
"""
|
||||||
|
Plugin for filtering paths of a site to be retrieved.
|
||||||
|
|
||||||
|
This plugin implements :func:`sp_filter`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def sp_filter(site, path, robots) -> bool:
|
||||||
|
"""
|
||||||
|
Per-site path filter. Return whether the path shall be retrieved.
|
||||||
|
"""
|
||||||
|
if not robots.can_fetch_url(site.base_url + path):
|
||||||
|
return False
|
||||||
|
if 'amusewiki' in site.meta_info.get('generator', '').lower():
|
||||||
|
if any(
|
||||||
|
[
|
||||||
|
path.endswith(end)
|
||||||
|
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
|
||||||
|
]
|
||||||
|
):
|
||||||
|
return False
|
||||||
|
if '/bbselect?' in path:
|
||||||
|
return False
|
||||||
|
return True
|
10
src/atextcrawler/resource/__init__.py
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
from .dedup import store_boilerplate_texts
|
||||||
|
from .feed import feed_types, update_feed
|
||||||
|
from .fetch import ResourceFetcher
|
||||||
|
from .operations import (
|
||||||
|
add_site_paths,
|
||||||
|
get_site_path,
|
||||||
|
process_site_path,
|
||||||
|
store_feed_entries,
|
||||||
|
)
|
||||||
|
from .sitemap import extract_sitemap_paths, get_sitemap_urls
|
96
src/atextcrawler/resource/__main__.py
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
"""
|
||||||
|
Dev tool for fetching and displaying a resource.
|
||||||
|
|
||||||
|
Has no permanent effects.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
from pprint import pformat
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
from ..models import Feed, TextResource
|
||||||
|
from ..resource import ResourceFetcher
|
||||||
|
from ..utils.annotation import pack_annotations, unpack_annotations
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
|
||||||
|
logger = logging.getLogger()
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
logger.addHandler(logging.StreamHandler())
|
||||||
|
logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug')
|
||||||
|
logger_page_debug.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
def add_tags(text, annotations):
|
||||||
|
"""
|
||||||
|
Reconstruct html from text and annotations.
|
||||||
|
|
||||||
|
This is very similar to what the client does when displaying
|
||||||
|
a cached hit.
|
||||||
|
"""
|
||||||
|
html = ''
|
||||||
|
opening_tags = defaultdict(list)
|
||||||
|
closing_tags = defaultdict(list)
|
||||||
|
anns_tags = sorted(
|
||||||
|
annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1])
|
||||||
|
)
|
||||||
|
for (i, f), anns in anns_tags:
|
||||||
|
opening_tags[i] += [tag for tag in reversed(anns)]
|
||||||
|
closing_tags[f] += [tag for tag in reversed(anns)]
|
||||||
|
positions = sorted(set(opening_tags.keys()) | set(closing_tags.keys()))
|
||||||
|
last_pos = 0
|
||||||
|
links = {i: href for href, (i, f, rel) in annotations['links'].items()}
|
||||||
|
for pos in positions:
|
||||||
|
html += text[last_pos:pos]
|
||||||
|
closing = closing_tags.get(pos, [])
|
||||||
|
opening = opening_tags.get(pos, [])
|
||||||
|
common = set(closing) & set(opening)
|
||||||
|
closing = [tag for tag in closing if tag not in common]
|
||||||
|
opening = [tag for tag in opening if tag not in common]
|
||||||
|
tags_html = ''
|
||||||
|
for tag in reversed(closing):
|
||||||
|
html += f'</{tag}>\n'
|
||||||
|
for tag in opening:
|
||||||
|
if tag == 'a':
|
||||||
|
href = links.get(pos, '#')
|
||||||
|
html += f'<a href="{href}">'
|
||||||
|
else:
|
||||||
|
html += f'<{tag}>'
|
||||||
|
last_pos = pos
|
||||||
|
return html
|
||||||
|
|
||||||
|
|
||||||
|
async def run():
|
||||||
|
"""
|
||||||
|
Fetch and display a resource with URL given as cmdline argument.
|
||||||
|
"""
|
||||||
|
url = sys.argv[1]
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
if not (durl := await Durl(url)):
|
||||||
|
return
|
||||||
|
fetcher = ResourceFetcher(session)
|
||||||
|
resource = await fetcher.fetch(url)
|
||||||
|
if isinstance(resource, TextResource):
|
||||||
|
logger.warning(repr(resource))
|
||||||
|
logger.warning(f'Language: {resource.lang}')
|
||||||
|
logger.warning(pformat(resource.search_fields))
|
||||||
|
logger.warning(pformat(resource.init_fields))
|
||||||
|
|
||||||
|
# annotations = resource.search_fields.get('annotations')
|
||||||
|
# text = resource.search_fields['text']
|
||||||
|
# with open('/tmp/1.html', 'w') as f:
|
||||||
|
# html = add_tags(text, annotations)
|
||||||
|
# f.write(f'<html lang="de">\n<head><title>hhh</title></head>'
|
||||||
|
# f'<body>\n{html}\n</body></html>')
|
||||||
|
elif isinstance(resource, Feed):
|
||||||
|
logger.warning(resource.debug())
|
||||||
|
else:
|
||||||
|
logger.warning(f'Resource has type {type(resource)}')
|
||||||
|
logger.warning(resource)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
asyncio.run(run())
|
59
src/atextcrawler/resource/dedup.py
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
"""
|
||||||
|
Find boilerplate texts.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
from ..models import TextResource
|
||||||
|
from ..utils.probe import extract_samples
|
||||||
|
from ..utils.section import iter_sections
|
||||||
|
|
||||||
|
|
||||||
|
async def store_boilerplate_texts(fetcher, conn, site):
|
||||||
|
"""
|
||||||
|
Find and store boilerplate texts of a site.
|
||||||
|
|
||||||
|
Fetch the start page and internal sample links obtained from it.
|
||||||
|
If there are sufficiently frequently appearing text sections,
|
||||||
|
consider them as boilerplate texts.
|
||||||
|
|
||||||
|
If boilerplate_texts were found, update the given site instance.
|
||||||
|
"""
|
||||||
|
startpage = await fetcher.fetch(site.base_url, site=site)
|
||||||
|
if (
|
||||||
|
not isinstance(startpage, TextResource)
|
||||||
|
or startpage.content_type != 'html'
|
||||||
|
):
|
||||||
|
return
|
||||||
|
|
||||||
|
# fetch sample resources
|
||||||
|
sample_links = extract_samples(startpage.init_fields['links_int'])
|
||||||
|
resources = [startpage]
|
||||||
|
for sample_link in sample_links:
|
||||||
|
if sample_link.path == site.base_url: # avoid duplicate resources
|
||||||
|
continue # NB: duplicate resources may have different paths
|
||||||
|
sample_resource = await fetcher.fetch(sample_link.url(), site=None)
|
||||||
|
if (
|
||||||
|
isinstance(sample_resource, TextResource)
|
||||||
|
and sample_resource.content_type == 'html'
|
||||||
|
):
|
||||||
|
resources.append(sample_resource)
|
||||||
|
|
||||||
|
# find common texts in resources
|
||||||
|
if (n_resources := len(resources)) > 2:
|
||||||
|
text_freq = Counter()
|
||||||
|
for resource in resources:
|
||||||
|
text = resource.search_fields['text']
|
||||||
|
semantic_breaks = resource.search_fields['annotations'][
|
||||||
|
'semantic_breaks'
|
||||||
|
]
|
||||||
|
for sec in iter_sections(text, semantic_breaks):
|
||||||
|
text_freq[sec[3]] += 1
|
||||||
|
boilerplate_texts = []
|
||||||
|
if min(text_freq.values() or [0]) == 1: # no resource fetched twice
|
||||||
|
for text, freq in text_freq.items():
|
||||||
|
if freq > 2:
|
||||||
|
boilerplate_texts.append(text)
|
||||||
|
sql = "UPDATE site SET boilerplate_texts=$1 WHERE id=$2"
|
||||||
|
await conn.execute(sql, boilerplate_texts, site.id_)
|
||||||
|
site.boilerplate_texts = boilerplate_texts
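# A minimal, self-contained sketch of the counting idea used above, kept
# separate from the crawler logic (the input is made up): a section text
# counts as boilerplate when it appears on more than two of the sampled pages.
def _example_boilerplate(section_texts_per_page: list[list[str]]) -> list[str]:
    freq: Counter = Counter()
    for sections in section_texts_per_page:
        for text in sections:
            freq[text] += 1
    return [text for text, n in freq.items() if n > 2]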
|
131
src/atextcrawler/resource/document.py
Normal file
|
@ -0,0 +1,131 @@
|
||||||
|
"""
|
||||||
|
Parse documents (often application/pdf).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from tika import parser
|
||||||
|
|
||||||
|
from ..models import ResourceError, ResourceRedirect, Site, TextResource
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
from ..utils.http import get_header_links
|
||||||
|
from ..utils.lang import extract_content_language
|
||||||
|
from .plaintext import annotate_text
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger_debug = logging.getLogger(__name__ + '.debug')
|
||||||
|
logger_debug.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
re_url = re.compile(
|
||||||
|
r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
|
||||||
|
r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_document(
|
||||||
|
durl: Durl,
|
||||||
|
resp: dict,
|
||||||
|
site: Optional[Site],
|
||||||
|
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
|
||||||
|
"""
|
||||||
|
Extract plain text from documents in various formats.
|
||||||
|
"""
|
||||||
|
content = resp['content']
|
||||||
|
|
||||||
|
# HTTP headers, canonical URL, shortlink
|
||||||
|
header_links = await get_header_links(resp['headers'], durl, site)
|
||||||
|
if canonical := header_links.get('canonical'):
|
||||||
|
if canonical != durl.url():
|
||||||
|
return ResourceRedirect(resp['redirects'] + [canonical])
|
||||||
|
shortlink = header_links.get('shortlink')
|
||||||
|
|
||||||
|
# use tika to extract text
|
||||||
|
doc = parser.from_buffer(content)
|
||||||
|
# logger.debug(pformat(doc))
|
||||||
|
if doc.get('status') != 200:
|
||||||
|
msg = f'Analyzing document failed: {durl.url()}'
|
||||||
|
return ResourceError(msg)
|
||||||
|
|
||||||
|
# collect meta data
|
||||||
|
meta = doc.get('metadata', {})
|
||||||
|
content_type = meta.get('Content-Type')
|
||||||
|
if isinstance(content_type, list):
|
||||||
|
content_type = content_type[-1]
|
||||||
|
title = concat(meta.get('title'))
|
||||||
|
concat(meta.get('creator'))
|
||||||
|
last_change = extract_latest(meta.get('date') or meta.get('created'))
|
||||||
|
keywords = None
|
||||||
|
|
||||||
|
# text content
|
||||||
|
text = (doc.get('content') or '').strip()
|
||||||
|
|
||||||
|
# links
|
||||||
|
links_int: dict[Durl, tuple[list[str], str]] = {}
|
||||||
|
links_ext: dict[Durl, tuple[list[str], str]] = {}
|
||||||
|
for url in re_url.findall(text):
|
||||||
|
link_durl = await Durl(url[0])
|
||||||
|
if link_durl:
|
||||||
|
if link_durl.site() == durl.site():
|
||||||
|
links_int[link_durl] = [], link_durl.url()
|
||||||
|
else:
|
||||||
|
links_ext[link_durl] = [], link_durl.url()
|
||||||
|
|
||||||
|
# annotations
|
||||||
|
text, annotations = annotate_text(text)
|
||||||
|
|
||||||
|
return TextResource(
|
||||||
|
content_type=content_type,
|
||||||
|
last_change=last_change,
|
||||||
|
text_len=len(text),
|
||||||
|
lang=extract_content_language(text),
|
||||||
|
title=title,
|
||||||
|
init_fields={
|
||||||
|
'durl': durl,
|
||||||
|
'site': site,
|
||||||
|
'headers': resp['headers'],
|
||||||
|
'redirects': resp['redirects'],
|
||||||
|
'links_int': links_int,
|
||||||
|
'links_ext': links_ext,
|
||||||
|
'shortlink': shortlink,
|
||||||
|
'canonical': None,
|
||||||
|
},
|
||||||
|
search_fields={
|
||||||
|
'title': title,
|
||||||
|
'pub_date': last_change,
|
||||||
|
'keywords': keywords,
|
||||||
|
'text': text,
|
||||||
|
'annotations': annotations,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]:
|
||||||
|
"""
|
||||||
|
Extract the lastest date (if any) from a string or list of strings.
|
||||||
|
"""
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
if not isinstance(s, list):
|
||||||
|
s = [s]
|
||||||
|
dt = []
|
||||||
|
for t in s:
|
||||||
|
try:
|
||||||
|
dt.append(datetime.fromisoformat(t.rstrip('Z')))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return max(dt) if dt else None
|
||||||
|
|
||||||
|
|
||||||
|
def concat(s: Optional[Union[str, list]]) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Helper function for joining strings together.
|
||||||
|
"""
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
if not isinstance(s, list):
|
||||||
|
s = [s]
|
||||||
|
return ' '.join(s)
|
155
src/atextcrawler/resource/feed.py
Normal file
|
@ -0,0 +1,155 @@
|
||||||
|
"""
|
||||||
|
Stuff related to feeds.
|
||||||
|
|
||||||
|
Higher-level stuff is in site.feeds.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
from asyncpg import Connection
|
||||||
|
from feedparser import parse
|
||||||
|
|
||||||
|
from ..models import Feed, MetaResource, ResourceError
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
feed_types = (
|
||||||
|
'application/rss+xml',
|
||||||
|
'application/atom+xml',
|
||||||
|
'application/feed+json',
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def update_feed(fetcher, feed, conn) -> Optional[list[dict]]:
|
||||||
|
"""
|
||||||
|
Fetch, parse and return a given feed's content. Also update *feed*.
|
||||||
|
|
||||||
|
If the server replied with HTTP 410, delete the feed.
|
||||||
|
If there is no new information (server replied with HTTP 304),
|
||||||
|
return None. For other errors also return None and increase the
|
||||||
|
fail_count.
|
||||||
|
"""
|
||||||
|
headers = {'Cache-control': 'max-age=600'}
|
||||||
|
if feed.modified:
|
||||||
|
headers['If-Modified-Since'] = feed.modified
|
||||||
|
elif feed.etag:
|
||||||
|
headers['If-None-Match'] = feed.etag.removeprefix('W/')
|
||||||
|
resource = await fetcher.fetch(feed.url, headers=headers)
|
||||||
|
if isinstance(resource, ResourceError):
|
||||||
|
if resource.status == 410:
|
||||||
|
msg = f'Feed has vanished, deleting it: {feed}'
|
||||||
|
logger.debug(msg)
|
||||||
|
await feed.delete(conn)
|
||||||
|
if resource.status != 304:
|
||||||
|
feed.fail_count += 1
|
||||||
|
if feed.fail_count > 5:
|
||||||
|
msg = f'Feed not reachable, deleting it: {feed}'
|
||||||
|
logger.debug(msg)
|
||||||
|
await feed.delete(conn)
|
||||||
|
return None # HTTP 304, no new entries
|
||||||
|
elif isinstance(resource, Feed):
|
||||||
|
resource.id_ = feed.id_
|
||||||
|
resource.site_id = feed.site_id
|
||||||
|
await resource.save(conn)
|
||||||
|
return resource.entries
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_json_feed(resp, data: dict) -> Feed:
|
||||||
|
"""
|
||||||
|
Parse a JSON response for jsonfeed information.
|
||||||
|
|
||||||
|
TODO: handle 'next_url' (see https://jsonfeed.org/version/1.1)
|
||||||
|
"""
|
||||||
|
feed = Feed()
|
||||||
|
feed.url = data.get('feed_url', resp['redirects'][-1])
|
||||||
|
feed.etag = resp['headers'].get('ETag')
|
||||||
|
feed.modified = resp['headers'].get('Last-Modified')
|
||||||
|
feed.t_visit = datetime.utcnow()
|
||||||
|
version = data.get('version', '')
|
||||||
|
version = 'json-' + version.removeprefix('https://jsonfeed.org/version/')
|
||||||
|
feed.version = version[:10]
|
||||||
|
feed.title = data.get('title')
|
||||||
|
feed.description = data.get('description')
|
||||||
|
feed.fail_count = 0
|
||||||
|
entries = []
|
||||||
|
latest = None
|
||||||
|
# parse feed entries to a dict compatible with feedparser's entries
|
||||||
|
for feed_item in data.get('items', []):
|
||||||
|
entry = {}
|
||||||
|
entry['link'] = feed_item.get('url')
|
||||||
|
dt = feed_item.get('date_published')
|
||||||
|
if dt:
|
||||||
|
dt = datetime.fromisoformat(dt) if dt else None
|
||||||
|
dt = dt.astimezone(tz=None).replace(tzinfo=timezone.utc)
|
||||||
|
entry['published_parsed'] = dt.timetuple()
|
||||||
|
entry['title'] = feed_item.get('title')
|
||||||
|
entry['summary'] = feed_item.get('summary')
|
||||||
|
entries.append(entry)
|
||||||
|
if dt:
|
||||||
|
latest = max(latest or dt, dt)
|
||||||
|
feed.entries = entries
|
||||||
|
feed.t_content = latest
|
||||||
|
return feed
|
||||||
|
|
||||||
|
|
||||||
|
def parse_xml_feed(resp) -> Union[Feed, ResourceError]:
|
||||||
|
"""
|
||||||
|
Parse a response from Fetcher.get_resp() for xml feed information.
|
||||||
|
"""
|
||||||
|
feed = Feed()
|
||||||
|
feed.url = resp['redirects'][-1]
|
||||||
|
feed.etag = resp['headers'].get('ETag')
|
||||||
|
feed.modified = resp['headers'].get('Last-Modified')
|
||||||
|
feed.t_visit = datetime.utcnow()
|
||||||
|
try:
|
||||||
|
parsed = parse(resp['content'], response_headers=resp['headers'])
|
||||||
|
except Exception as error:
|
||||||
|
return ResourceError(f'Feedparser error: {error}')
|
||||||
|
latest = parsed['feed'].get('updated_parsed')
|
||||||
|
if latest:
|
||||||
|
latest = datetime(*latest[:6])
|
||||||
|
feed.t_content = max(feed.t_content or latest, latest)
|
||||||
|
feed.version = parsed['version']
|
||||||
|
feed.title = parsed['feed'].get('title', '')[:200] or None
|
||||||
|
feed.description = parsed['feed'].get('description')
|
||||||
|
feed.fail_count = 0
|
||||||
|
feed.entries = parsed['entries']
|
||||||
|
return feed
|
||||||
|
|
||||||
|
|
||||||
|
def convert_feed_entries(
|
||||||
|
base_url: Optional[str],
|
||||||
|
entries: list[dict],
|
||||||
|
) -> tuple[
|
||||||
|
list[tuple[str, bool]],
|
||||||
|
dict[str, tuple[Optional[str], Optional[str], Optional[str]]],
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Extract paths and resource meta information from a feed's entries.
|
||||||
|
|
||||||
|
Return paths in a structure wanted by :func:`add_site_paths` and
|
||||||
|
resource meta information in a structure wanted by
|
||||||
|
:func:`update_resource_meta`.
|
||||||
|
"""
|
||||||
|
paths = []
|
||||||
|
resource_meta = {}
|
||||||
|
for entry in entries:
|
||||||
|
if entry.get('link') and entry['link'].startswith(base_url or ''):
|
||||||
|
path = entry['link'].removeprefix(base_url or '').lstrip('/')
|
||||||
|
if len(path) <= 200:
|
||||||
|
last_update = entry.get('published_parsed')
|
||||||
|
if last_update:
|
||||||
|
last_update = datetime(*last_update[:6])
|
||||||
|
paths.append((path, True))
|
||||||
|
resource_meta[path] = (
|
||||||
|
last_update,
|
||||||
|
entry.get('title', '')[:200] or None,
|
||||||
|
entry.get('summary', '')[:2000] or None,
|
||||||
|
)
|
||||||
|
return paths, resource_meta
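# Illustrative only (not called anywhere): the shapes returned by
# convert_feed_entries(), shown with a single made-up feedparser-style entry.
def _example_convert_feed_entries():
    entry = {
        'link': 'https://example.org/blog/hello',
        'published_parsed': datetime(2021, 11, 1, 12, 0, 0).timetuple(),
        'title': 'Hello',
        'summary': 'First post.',
    }
    paths, resource_meta = convert_feed_entries('https://example.org', [entry])
    # paths == [('blog/hello', True)]
    # resource_meta == {'blog/hello': (datetime(2021, 11, 1, 12, 0), 'Hello',
    #                                  'First post.')}
    return paths, resource_meta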
|
327
src/atextcrawler/resource/fetch.py
Normal file
|
@ -0,0 +1,327 @@
|
||||||
|
"""
|
||||||
|
Access to a resource specified by a URL.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import logging
|
||||||
|
from json import loads
|
||||||
|
from traceback import format_exc
|
||||||
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
|
import aiohttp
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from ..models import (
|
||||||
|
Feed,
|
||||||
|
MetaResource,
|
||||||
|
ResourceError,
|
||||||
|
ResourceRedirect,
|
||||||
|
Site,
|
||||||
|
TextResource,
|
||||||
|
)
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
from ..utils.link import in_blacklist
|
||||||
|
from .document import parse_document
|
||||||
|
from .feed import parse_json_feed, parse_xml_feed
|
||||||
|
from .page import parse_html
|
||||||
|
from .plaintext import parse_plaintext
|
||||||
|
from .sitemap import parse_sitemap, parse_sitemapindex
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
MAX_REDIRECTS = 10
|
||||||
|
"""
|
||||||
|
Maximum number of redirects to follow.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
default_headers = {
|
||||||
|
'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)'
|
||||||
|
' Gecko/20100101 Firefox/78.0',
|
||||||
|
'DNT': '1',
|
||||||
|
'Upgrade-Insecure-Requests': '1',
|
||||||
|
'Accept-Language': 'en-US,en;q=0.5, *;q=0.5',
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
Default HTTP client headers, overwriting those of aiohttp.ClientSession.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
blacklist_content_types = [
|
||||||
|
'',
|
||||||
|
'application/ogg',
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
Blacklist for content-types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
text_content_types = {
|
||||||
|
'text/html': 'html',
|
||||||
|
'text/plain': 'plain',
|
||||||
|
'application/rss+xml': 'feed-rss',
|
||||||
|
'application/atom+xml': 'feed-atom',
|
||||||
|
'application/feed+json': 'feed-json',
|
||||||
|
'application/json': 'json',
|
||||||
|
'application/xml': 'xml',
|
||||||
|
'text/xml': 'xml',
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
Map content-types to parsers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class ResourceFetcher:
|
||||||
|
"""
|
||||||
|
Fetch a resource specified by a URL (:meth:`fetch`).
|
||||||
|
|
||||||
|
The timeout is the same for all requests.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
session: aiohttp.ClientSession,
|
||||||
|
timeout_sock_connect: Union[int, float] = 8,
|
||||||
|
timeout_sock_read: Union[int, float] = 30,
|
||||||
|
):
|
||||||
|
self.session = session
|
||||||
|
self.timeout = aiohttp.ClientTimeout(
|
||||||
|
sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
|
||||||
|
)
|
||||||
|
|
||||||
|
async def fetch(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
site: Optional[Site] = None,
|
||||||
|
redirect_history: Optional[list[str]] = None,
|
||||||
|
headers: Optional[dict] = None,
|
||||||
|
) -> Union[
|
||||||
|
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Try to fetch a resource and return an instance or error or redirect.
|
||||||
|
|
||||||
|
If an error was encountered, return a ResourceError.
|
||||||
|
If the resource has an irrelevant content type, return None.
|
||||||
|
Otherwise return a specific content instance.
|
||||||
|
|
||||||
|
Argument *redirect_history* contains the redirect history;
|
||||||
|
if one of the redirects is encountered again, return None.
|
||||||
|
"""
|
||||||
|
if redirect_history is None:
|
||||||
|
redirect_history = []
|
||||||
|
if not (durl := await Durl(url)):
|
||||||
|
return ResourceError('Invalid URL')
|
||||||
|
resp = await self.get_resp(
|
||||||
|
durl,
|
||||||
|
redirect_history=redirect_history,
|
||||||
|
headers=headers,
|
||||||
|
)
|
||||||
|
if isinstance(resp, ResourceError):
|
||||||
|
return resp
|
||||||
|
if resp is None:
|
||||||
|
return None
|
||||||
|
result = await self._parse(durl, site, resp)
|
||||||
|
if isinstance(result, (MetaResource, TextResource)):
|
||||||
|
result.id_ = None
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def _parse(
|
||||||
|
self, durl, site, resp, in_recursion=False
|
||||||
|
) -> Union[
|
||||||
|
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Parse a response. May call itself.
|
||||||
|
"""
|
||||||
|
result: Union[
|
||||||
|
None, MetaResource, TextResource, ResourceError, ResourceRedirect
|
||||||
|
] = None
|
||||||
|
content = resp['content']
|
||||||
|
if isinstance(content, str) and content.startswith('<?xml '):
|
||||||
|
result = await parse_xml(durl, resp)
|
||||||
|
elif resp['parser'] == 'feed-rss':
|
||||||
|
result = await parse_xml(durl, resp, rss=True)
|
||||||
|
elif resp['parser'] == 'feed-atom':
|
||||||
|
result = await parse_xml(durl, resp, atom=True)
|
||||||
|
elif resp['parser'] == 'xml':
|
||||||
|
result = await parse_xml(durl, resp)
|
||||||
|
elif resp['parser'] == 'html':
|
||||||
|
result = await parse_html(durl, resp, site)
|
||||||
|
elif resp['parser'] in ('json', 'feed-json'):
|
||||||
|
result = await parse_json(durl, resp)
|
||||||
|
elif resp['parser'] == 'plain':
|
||||||
|
result = await parse_plaintext(durl, resp, site)
|
||||||
|
elif resp['parser'] == 'application':
|
||||||
|
if resp['headers'].get('content-type') == 'application/x-gzip':
|
||||||
|
if in_recursion:
|
||||||
|
return None # consider nested gzip an attack
|
||||||
|
resp['content'] = gzip.decompress(resp['content'])
|
||||||
|
return await self._parse(durl, site, resp, in_recursion=True)
|
||||||
|
result = await parse_document(durl, resp, site)
|
||||||
|
if isinstance(result, ResourceRedirect):
|
||||||
|
redir_url = result.urls[-1]
|
||||||
|
result = await self.fetch(
|
||||||
|
redir_url,
|
||||||
|
site=site,
|
||||||
|
redirect_history=result.urls[:-1],
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def get_resp(
|
||||||
|
self,
|
||||||
|
durl: Durl,
|
||||||
|
headers: dict = None,
|
||||||
|
redirect_history: Optional[list[str]] = None,
|
||||||
|
) -> Optional[Union[ResourceError, dict]]:
|
||||||
|
"""
|
||||||
|
Try to fetch a url returning a ResourceError or a dict with content.
|
||||||
|
|
||||||
|
Optional *headers* will overwrite the :var:`default_headers`.
|
||||||
|
|
||||||
|
If the response status is not 200, always return a ResourceError.
|
||||||
|
|
||||||
|
If the content-type is not relevant (see blacklist_content_types),
|
||||||
|
return None.
|
||||||
|
|
||||||
|
The dict contains these keys+values:
|
||||||
|
|
||||||
|
* 'parser': a hint on the parser to use for analyzing the content;
|
||||||
|
one of 'html', 'plain', 'feed-rss', 'feed-atom', 'feed-json', 'json', 'xml', 'application'
|
||||||
|
* 'content': bytes for type application, otherwise str
|
||||||
|
* 'redirects': a list of URLs visited during HTTP redirection,
|
||||||
|
the last item is the final URL
|
||||||
|
* 'headers': response headers
|
||||||
|
"""
|
||||||
|
if redirect_history is None:
|
||||||
|
redirect_history = []
|
||||||
|
if len(redirect_history) >= MAX_REDIRECTS:
|
||||||
|
return None
|
||||||
|
headers_ = default_headers.copy()
|
||||||
|
if headers:
|
||||||
|
headers_.update(headers)
|
||||||
|
try:
|
||||||
|
async with self.session.get(
|
||||||
|
durl.url(),
|
||||||
|
headers=headers_,
|
||||||
|
timeout=self.timeout,
|
||||||
|
) as resp:
|
||||||
|
redirects = [durl.url()]
|
||||||
|
if resp.history:
|
||||||
|
href = resp.history[-1].headers.get('location')
|
||||||
|
if not href or not (redurl := await Durl(href, base=durl)):
|
||||||
|
msg = 'Invalid URL after HTTP redirect'
|
||||||
|
return ResourceError(msg)
|
||||||
|
if in_blacklist(redurl.hostname):
|
||||||
|
src_url = (
|
||||||
|
redirect_history[0]
|
||||||
|
if redirect_history
|
||||||
|
else durl.url()
|
||||||
|
)
|
||||||
|
msg = (
|
||||||
|
f'Dropping URL {src_url}, since'
|
||||||
|
f' redirected to a blacklisted site'
|
||||||
|
)
|
||||||
|
logger.debug(msg)
|
||||||
|
return None
|
||||||
|
redirects = [str(r.url) for r in resp.history]
|
||||||
|
redirects.append(redurl.url())
|
||||||
|
if join := set(redirect_history) & set(redirects):
|
||||||
|
msg = f'Cyclic redirect {join}'
|
||||||
|
return ResourceError(msg)
|
||||||
|
if resp.status != 200:
|
||||||
|
msg = f'HTTP status {resp.status}'
|
||||||
|
return ResourceError(
|
||||||
|
msg, status=resp.status, headers=headers
|
||||||
|
)
|
||||||
|
c_type = resp.headers.get('content-type', '').split(';')[0]
|
||||||
|
if c_type in blacklist_content_types:
|
||||||
|
return None
|
||||||
|
result: dict[str, Any] = {
|
||||||
|
'redirects': redirect_history + redirects,
|
||||||
|
'headers': resp.headers,
|
||||||
|
}
|
||||||
|
if c_type in text_content_types.keys():
|
||||||
|
try: # catch decoding issues
|
||||||
|
content = await resp.text()
|
||||||
|
except:
|
||||||
|
body = await resp.read()
|
||||||
|
encoding = resp.charset or 'utf-8'
|
||||||
|
encoding = encoding.replace('CP-1250', 'cp1250')
|
||||||
|
content = body.decode(encoding, errors='replace')
|
||||||
|
result['content'] = content
|
||||||
|
result['parser'] = text_content_types[c_type]
|
||||||
|
return result
|
||||||
|
elif c_type.startswith('application/'):
|
||||||
|
result['content'] = await resp.read()
|
||||||
|
result['parser'] = 'application'
|
||||||
|
return result
|
||||||
|
except aiohttp.ClientError as error:
|
||||||
|
# on certificate error try without tls
|
||||||
|
if 'SSLCertVerificationError' in str(error):
|
||||||
|
if durl.scheme == 'https':
|
||||||
|
url = durl.url()
|
||||||
|
durl.replace_scheme('http')
|
||||||
|
response = await self.get_resp(
|
||||||
|
durl=durl,
|
||||||
|
headers=headers,
|
||||||
|
redirect_history=redirect_history + [url],
|
||||||
|
)
|
||||||
|
if not isinstance(response, ResourceError):
|
||||||
|
return response
|
||||||
|
msg = f'ClientError: {error}'
|
||||||
|
return ResourceError(msg)
|
||||||
|
except Exception as error:
|
||||||
|
msg = f'Unknown error: {error}:\n{format_exc()}'
|
||||||
|
logger.error(msg)
|
||||||
|
return ResourceError(msg)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_xml(
|
||||||
|
durl: Durl,
|
||||||
|
response: dict,
|
||||||
|
rss=False,
|
||||||
|
atom=False,
|
||||||
|
) -> Optional[Union[MetaResource, ResourceError]]:
|
||||||
|
"""
|
||||||
|
Parse XML content.
|
||||||
|
|
||||||
|
In particular, parse sitemapindex, sitemap, RSS feed, atom feed.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
xml = response['content']
|
||||||
|
soup = BeautifulSoup(xml, 'html.parser')
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
if rss or (rss := soup.find('rss')):
|
||||||
|
return parse_xml_feed(response)
|
||||||
|
elif atom or (atom := soup.find('atom')):
|
||||||
|
return parse_xml_feed(response)
|
||||||
|
elif sitemapindex := soup.find('sitemapindex'):
|
||||||
|
return parse_sitemapindex(sitemapindex)
|
||||||
|
elif urlset := soup.find('urlset'):
|
||||||
|
return parse_sitemap(urlset)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_json(
|
||||||
|
durl: Durl,
|
||||||
|
response: dict,
|
||||||
|
) -> Optional[Union[Feed, ResourceError]]:
|
||||||
|
"""
|
||||||
|
Parse the content of JSON feeds.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
data = loads(response['content'])
|
||||||
|
except:
|
||||||
|
msg = f'Could not parse JSON from {durl.url()}'
|
||||||
|
logger.debug(msg)
|
||||||
|
return None
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
return None
|
||||||
|
if data.get('version', '').startswith('https://jsonfeed.org/'):
|
||||||
|
return parse_json_feed(response, data)
|
||||||
|
return None
|
347
src/atextcrawler/resource/operations.py
Normal file
|
@ -0,0 +1,347 @@
|
||||||
|
"""
|
||||||
|
Operations on resources.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional, Sequence
|
||||||
|
|
||||||
|
from asyncpg import Connection
|
||||||
|
|
||||||
|
from ..models import (
|
||||||
|
Feed,
|
||||||
|
MetaResource,
|
||||||
|
ResourceError,
|
||||||
|
Site,
|
||||||
|
Sitemap,
|
||||||
|
SitemapIndex,
|
||||||
|
SitePath,
|
||||||
|
TextResource,
|
||||||
|
)
|
||||||
|
from ..search import delete_resource, index_resource
|
||||||
|
from ..tensorflow import TensorFlow
|
||||||
|
from ..utils.durl import Durl
|
||||||
|
from ..utils.similarity import (
|
||||||
|
create_simhash,
|
||||||
|
search_simhash,
|
||||||
|
simhash_from_bigint,
|
||||||
|
simhash_to_bigint,
|
||||||
|
)
|
||||||
|
from .feed import convert_feed_entries
|
||||||
|
from .fetch import ResourceFetcher
|
||||||
|
from .sitemap import extract_sitemap_paths
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
async def add_site_paths(
|
||||||
|
conn: Connection,
|
||||||
|
site_id: int,
|
||||||
|
paths: Sequence[tuple[str, Optional[bool]]],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Add site paths. If resource infos are given, also create resources.
|
||||||
|
|
||||||
|
The paths must be given as relative paths, together with a boolean
|
||||||
|
telling whether the link is a canonical link.
|
||||||
|
"""
|
||||||
|
sql = (
|
||||||
|
"INSERT INTO site_path (site_id, path, canonical)"
|
||||||
|
" VALUES ($1, $2, $3) ON CONFLICT (site_id, path) DO NOTHING"
|
||||||
|
)
|
||||||
|
values = (
|
||||||
|
(site_id, path, canonical)
|
||||||
|
for path, canonical in paths[:100000]
|
||||||
|
if len(path) <= 400
|
||||||
|
)
|
||||||
|
await conn.executemany(sql, values)
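# Illustrative only: what a call to add_site_paths() looks like. Paths are
# relative to the site's base_url; the boolean (or None) flags canonical
# links. The site id and paths below are made up.
async def _example_add_site_paths(conn: Connection) -> None:
    example_paths = [
        ('blog/first-post', True),       # known canonical link
        ('blog/first-post/feed', None),  # canonicity unknown
    ]
    await add_site_paths(conn, 123, example_paths)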
|
||||||
|
|
||||||
|
|
||||||
|
async def update_resource_meta(
|
||||||
|
conn: Connection,
|
||||||
|
site_id: int,
|
||||||
|
resource_meta: dict,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Update meta information of existing resources using path to find them.
|
||||||
|
"""
|
||||||
|
sql = (
|
||||||
|
"UPDATE resource SET last_change=coalesce($1, last_change),"
|
||||||
|
" title=coalesce($2, title), summary=coalesce($3, summary) FROM ("
|
||||||
|
" SELECT resource_id FROM site_path WHERE site_id=$4 AND path=$5"
|
||||||
|
") sp WHERE resource.id=sp.resource_id"
|
||||||
|
)
|
||||||
|
values = ((*meta, site_id, path) for path, meta in resource_meta.items())
|
||||||
|
await conn.executemany(sql, values)
|
||||||
|
|
||||||
|
|
||||||
|
async def store_feed_entries(
|
||||||
|
conn: Connection,
|
||||||
|
site: Site,
|
||||||
|
entries: list[dict],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Add missing resources of a site from given feed entries.
|
||||||
|
"""
|
||||||
|
if site.id_:
|
||||||
|
paths, resource_meta = convert_feed_entries(site.base_url, entries)
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
await update_resource_meta(conn, site.id_, resource_meta)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_site_path(
|
||||||
|
conn: Connection,
|
||||||
|
site: Site,
|
||||||
|
before: datetime,
|
||||||
|
only_new=False,
|
||||||
|
) -> Optional[SitePath]:
|
||||||
|
"""
|
||||||
|
Return the next path of a given site that needs to be processed.
|
||||||
|
|
||||||
|
If none needs to be processed, return None.
|
||||||
|
|
||||||
|
Only return paths that have last been visited before *before*
|
||||||
|
or not been processed at all. Paths with a ok_count of -3 or lower
|
||||||
|
are dropped.
|
||||||
|
|
||||||
|
If *only_new*, limit to paths that have not been processed at all,
|
||||||
|
irrespective of the value of *before*.
|
||||||
|
"""
|
||||||
|
if only_new:
|
||||||
|
sql = (
|
||||||
|
"SELECT * FROM site_path"
|
||||||
|
" WHERE site_id=$1 AND last_visit is null LIMIT 1"
|
||||||
|
) # implicitly canonical=null
|
||||||
|
row = await conn.fetchrow(sql, site.id_)
|
||||||
|
else:
|
||||||
|
sql = (
|
||||||
|
"SELECT * FROM site_path"
|
||||||
|
" WHERE site_id=$1 AND canonical IS NOT false AND"
|
||||||
|
" (last_visit is null OR last_visit<$2) AND"
|
||||||
|
" ok_count > -3 LIMIT 1"
|
||||||
|
) # canonical can be true or null
|
||||||
|
row = await conn.fetchrow(sql, site.id_, before)
|
||||||
|
if row:
|
||||||
|
return await SitePath().load_from_row(row)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def process_site_path(
|
||||||
|
app,
|
||||||
|
worker_number: int,
|
||||||
|
conn: Connection,
|
||||||
|
fetcher: ResourceFetcher,
|
||||||
|
tf: TensorFlow,
|
||||||
|
site: Site,
|
||||||
|
site_path: SitePath,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Fetch a path, deduplicate and if canonical, update and index the resource.
|
||||||
|
|
||||||
|
Return whether a new resource was handled that should contribute to the
|
||||||
|
statistics.
|
||||||
|
"""
|
||||||
|
msg = (
|
||||||
|
f'Worker {worker_number} processing site {site.id_}'
|
||||||
|
f' site_path {site_path.id_} {site.base_url}{site_path.path}'
|
||||||
|
)
|
||||||
|
logger.debug(msg)
|
||||||
|
if not site.id_: # only to satisfy typing
|
||||||
|
return False
|
||||||
|
|
||||||
|
# fetch url
|
||||||
|
site_path.last_visit = datetime.utcnow()
|
||||||
|
url = site_path.url(site)
|
||||||
|
resource = await fetcher.fetch(url, site=site)
|
||||||
|
|
||||||
|
# handle failure (possibly deleting old information)
|
||||||
|
if not isinstance(resource, (TextResource, MetaResource)):
|
||||||
|
if not resource: # irrelevant content-type
|
||||||
|
site_path.ok_count = -10
|
||||||
|
elif isinstance(resource, ResourceError):
|
||||||
|
site_path.ok_count -= 1
|
||||||
|
if site_path.ok_count <= -3 and site_path.resource_id:
|
||||||
|
await site_path.unlink_resource(
|
||||||
|
conn,
|
||||||
|
app.search_engine,
|
||||||
|
app.config['elasticsearch']['index_base_name'],
|
||||||
|
)
|
||||||
|
await site_path.save(conn)
|
||||||
|
if resource: # relevant content-type
|
||||||
|
msg = (
|
||||||
|
f'Worker {worker_number} failed to process site_path'
|
||||||
|
f' {site_path.id_} (site {site.id_},'
|
||||||
|
f' {site.base_url}{site_path.path})'
|
||||||
|
)
|
||||||
|
logger.info(msg)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# handle MetaResources
|
||||||
|
if isinstance(resource, MetaResource):
|
||||||
|
if isinstance(resource, Feed):
|
||||||
|
resource.site_id = site.id_
|
||||||
|
await resource.save(conn)
|
||||||
|
if resource.entries:
|
||||||
|
await store_feed_entries(conn, site, resource.entries)
|
||||||
|
elif isinstance(resource, Sitemap):
|
||||||
|
paths, _ = extract_sitemap_paths(site.base_url, resource.urls)
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
elif isinstance(resource, SitemapIndex):
|
||||||
|
for sitemap_dict in resource.sitemaps:
|
||||||
|
url = sitemap_dict['loc']
|
||||||
|
res_sitemap = await fetcher.fetch(url, site=site)
|
||||||
|
if isinstance(res_sitemap, Sitemap):
|
||||||
|
paths, _ = extract_sitemap_paths(
|
||||||
|
site.base_url, res_sitemap.urls
|
||||||
|
)
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
return False
|
||||||
|
|
||||||
|
# handle TextResource
|
||||||
|
relevant, is_new_resource = await _handle_text_resource(
|
||||||
|
app, conn, tf, site, site_path, resource, url
|
||||||
|
)
|
||||||
|
if not relevant:
|
||||||
|
return False
|
||||||
|
site_path.resource_id = resource.id_
|
||||||
|
site_path.canonical = resource.init_fields.get('canonical')
|
||||||
|
site_path.ok_count += 1
|
||||||
|
await site_path.save(conn)
|
||||||
|
|
||||||
|
if shortlink_url := resource.init_fields.get('shortlink'):
|
||||||
|
await _save_shortlink(
|
||||||
|
conn, site, url, resource, shortlink_url, site_path.last_visit
|
||||||
|
)
|
||||||
|
|
||||||
|
return is_new_resource
|
||||||
|
|
||||||
|
|
||||||
|
async def _handle_text_resource(
|
||||||
|
app, conn, tf, site, site_path, resource, url
|
||||||
|
) -> tuple[bool, bool]:
|
||||||
|
"""
|
||||||
|
Ingest a text resource.
|
||||||
|
|
||||||
|
Return whether the resource is relevant and whether it is new.
|
||||||
|
"""
|
||||||
|
# save the resource's internal links
|
||||||
|
paths = []
|
||||||
|
if links_int := resource.init_fields['links_int']:
|
||||||
|
for durl, (rel, _) in links_int.items():
|
||||||
|
rp_filter = app.plugins['filter_resource_path'].rp_filter
|
||||||
|
if path := rp_filter(site, durl):
|
||||||
|
canon = (rel and rel.lower() == 'canonical') or None
|
||||||
|
paths.append((path, canon))
|
||||||
|
await add_site_paths(conn, site.id_, paths)
|
||||||
|
|
||||||
|
# find resources similar to the current text
|
||||||
|
text = resource.search_fields['text']
|
||||||
|
if len(text) < 300: # discard resources with too short texts
|
||||||
|
site_path.resource_id = None
|
||||||
|
await site_path.save(conn)
|
||||||
|
return False, False
|
||||||
|
simhash = simhash_from_bigint(resource.simhash)
|
||||||
|
index = site.simhash_index
|
||||||
|
similar_ids = search_simhash(index, simhash)
|
||||||
|
|
||||||
|
# determine the destination resource and resources to be merged into it
|
||||||
|
old_id = site_path.resource_id
|
||||||
|
if (
|
||||||
|
old_id
|
||||||
|
and old_id in similar_ids
|
||||||
|
and ( # similar to old text
|
||||||
|
dest_resource := await TextResource().load(conn, old_id)
|
||||||
|
)
|
||||||
|
):
|
||||||
|
merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
|
||||||
|
else: # no old text, or old text not similar any more
|
||||||
|
if old_id:
|
||||||
|
await site_path.unlink_resource(
|
||||||
|
conn,
|
||||||
|
app.search_engine,
|
||||||
|
app.config['elasticsearch']['index_base_name'],
|
||||||
|
)
|
||||||
|
# find the first existing similar resource
|
||||||
|
for similar_id in similar_ids:
|
||||||
|
dest_resource = await TextResource().load(conn, similar_id)
|
||||||
|
if dest_resource:
|
||||||
|
# also require similar length
|
||||||
|
l1 = len(resource.search_fields['text'])
|
||||||
|
l2 = dest_resource.text_len
|
||||||
|
if 0.95 * l2 <= l1 <= 1.05 * l2:
|
||||||
|
merge_ids = list(
|
||||||
|
filter(lambda elem: elem != similar_id, similar_ids)
|
||||||
|
)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
dest_resource = None
|
||||||
|
merge_ids = []
|
||||||
|
|
||||||
|
# update or create the destination resource
|
||||||
|
if dest_resource:
|
||||||
|
is_new_resource = False
|
||||||
|
resource.simhash = create_simhash(index, dest_resource.id_, simhash)
|
||||||
|
await dest_resource.update_from_resource(resource)
|
||||||
|
resource = dest_resource
|
||||||
|
else:
|
||||||
|
is_new_resource = True
|
||||||
|
resource.simhash = simhash_to_bigint(simhash)
|
||||||
|
await resource.save(conn)
|
||||||
|
create_simhash(index, resource.id_, simhash)
|
||||||
|
|
||||||
|
# add resource to search index
|
||||||
|
if resource.content_type in ('html', 'plain'):
|
||||||
|
await index_resource(
|
||||||
|
app.search_engine,
|
||||||
|
tf,
|
||||||
|
site_path,
|
||||||
|
resource,
|
||||||
|
site.base_url,
|
||||||
|
url,
|
||||||
|
)
|
||||||
|
|
||||||
|
# merge resources: merge_ids -> resource
|
||||||
|
for merge_id in merge_ids:
|
||||||
|
# replace links to the merge resource with links to the dest resource
|
||||||
|
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
|
||||||
|
await conn.execute(sql, resource.id_ or None, merge_id)
|
||||||
|
# remove orphaned merge resource
|
||||||
|
sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
|
||||||
|
found = await conn.fetchval(sql, merge_id)
|
||||||
|
if found:
|
||||||
|
await delete_resource(
|
||||||
|
app.search_engine,
|
||||||
|
found[1],
|
||||||
|
merge_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
return True, is_new_resource
|
||||||
|
|
||||||
|
|
||||||
|
async def _save_shortlink(
|
||||||
|
conn, site, url, resource, shortlink_url, last_visit
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Save a shortlink.
|
||||||
|
"""
|
||||||
|
shortlink_durl = await Durl(shortlink_url, base=site.base_url)
|
||||||
|
if shortlink_durl and shortlink_url != url:
|
||||||
|
sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2"
|
||||||
|
sl_path = shortlink_durl.pwa()
|
||||||
|
row = await conn.fetchrow(sql, site.id_, sl_path)
|
||||||
|
shortlink = await SitePath().load_from_row(row)
|
||||||
|
if not shortlink:
|
||||||
|
shortlink = SitePath(
|
||||||
|
site_id=site.id_,
|
||||||
|
path=sl_path,
|
||||||
|
last_visit=last_visit,
|
||||||
|
ok_count=1,
|
||||||
|
canonical=False,
|
||||||
|
resource_id=resource.id_,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
shortlink.last_visit = last_visit
|
||||||
|
shortlink.ok_count += 1
|
||||||
|
shortlink.canonical = False
|
||||||
|
shortlink.resource_id = resource.id_
|
||||||
|
await shortlink.save(conn)
|
355
src/atextcrawler/resource/page.py
Normal file
|
@ -0,0 +1,355 @@
"""
Parse HTML pages.
"""

import logging
from copy import deepcopy
from typing import Optional, Union

from bs4 import BeautifulSoup
from tidylib import tidy_document

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import (
    annotate,
    annotations_remove_section,
    clean_annotations,
    get_tag_counts,
    headline_probability,
)
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl, assort_links
from ..utils.html import (
    clean_body,
    clean_page,
    extract_title,
    get_html_lang,
    get_html_redirect,
)
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.section import iter_sections
from ..utils.tag import keep_tags

logger = logging.getLogger(__name__)
logger_debug = logging.getLogger(__name__ + '.debug')
logger_debug.setLevel(logging.INFO)
logger_links = logging.getLogger(__name__ + '.debug.links')
logger_stats = logging.getLogger(__name__ + '.debug.stats')
logger_sections = logging.getLogger(__name__ + '.debug.sections')


async def parse_html(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]:
    """
    Extract relevant data from a response returning a TextResource instance.

    The given URL must be the full URL (incl. scheme and netloc) of the page.
    """
    html = resp['content']

    # follow link to canonical URL
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])

    # follow html redirect, if present
    if redir_url := get_html_redirect(html):
        if redir_url not in resp['redirects']:
            return ResourceRedirect(resp['redirects'] + [redir_url])
        else:
            msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}'
            return ResourceError(msg)

    # require html tag
    if not html[:14].lower().startswith('<!doctype html'):
        if '<html' not in html:
            return None

    # real URL after redirection
    url = resp['redirects'][-1]
    durl = await Durl(url)
    if not durl:
        return None

    # page title
    title = extract_title(html)

    # tidy html
    try:
        html, _ = tidy_document(
            html.encode('utf-8'),
            options={
                'logical-emphasis': 1,
                'merge-divs': 1,
                'merge-spans': 1,
                'hide-comments': 1,
                'output-bom': 0,
                'show-errors': 0,
            },
        )
        html = html.decode('utf-8')
    except:
        msg = f'Cannot tidy html from {url}'
        return ResourceError(msg)

    # drop irrelevant tags, including their contents
    soup = clean_page(html)

    # extract shortlink (from http headers or html head)
    shortlink = header_links.get('shortlink')
    if not shortlink and soup.head:
        for link in soup.head.find_all('link'):
            if 'shortlink' in link.get('rel', ''):
                if link.get('href'):
                    shortlink = link.get('href')
                    break

    # language, plaintext, annotations, last change
    lang = get_html_lang(html)
    html = clean_body(str(soup.body))
    head = soup.head
    text, annotations = annotate(html)
    if lng := extract_content_language(text):
        lang = lng
    last_change = extract_latest_date(html, lang=lang)

    # assort internal and external links
    base_url = None
    if head and head.base:
        base_url = head.base.get('href')
    if not base_url and site:
        base_url = site.base_url
    cleaned_links, links_int, links_ext = await assort_links(
        annotations['links'], durl, text, base_url
    )
    annotations['links'] = cleaned_links
    if logger_links.isEnabledFor(logging.DEBUG):
        logger_links.debug('==== internal links')
        for durl_, txt in links_int.items():
            logger_links.debug(f'{durl_.url()} {txt}')
        logger_links.debug('==== external links')
        for durl_, txt in links_ext.items():
            logger_links.debug(f'{durl_.url()} {txt}')

    # keywords from category links
    category_links = set()
    for href, (i, f, rel) in annotations['links'].items():
        if rel and ('category' in rel or 'tag' in rel):
            category_links.add(text[i:f])
    keywords = sorted(category_links)

    # filter out irrelevant sections
    filtered_text, filtered_ann = filter_sections(
        text, annotations, site.boilerplate_texts if site else None
    )

    # debug statistics
    if logger_stats.isEnabledFor(logging.DEBUG):
        sb = annotations['semantic_breaks']
        fsb = filtered_ann['semantic_breaks']
        logger_stats.debug(
            f'Page statistics:'
            f' html_len={len(html)} text_len={len(filtered_text)}'
            f' ratio={len(filtered_text) / len(html):.2f};'
            f' sections={len(sb)} filtered_sections={len(fsb)}'
            f' ratio={len(fsb) / len(sb):.2f} url={durl.url()}'
        )

    return TextResource(
        content_type='html',
        last_change=last_change,
        text_len=len(text),
        lang=lang,
        title=title,
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            'canonical': True if canonical else None,
            'head': head,
        },
        search_fields={
            'title': title,
            'pub_date': last_change,
            'keywords': keywords,
            'text': filtered_text,
            'annotations': filtered_ann,
            'head': str(head),
        },
    )


def filter_sections(text, annotations, boilerplate_texts):
    """
    Filter out irrelevant sections using scores and factoring in neighbors.
    """
    tags = annotations['tags']
    sb = annotations['semantic_breaks']
    section_ids = annotations['section_ids']

    # for i1,f1 in sorted(tags.keys()):
    #     print('  ', i1,f1,tags[(i1,f1)], text[i1:f1])
    # for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
    #     print('-' * lvl, i,f,','.join(tags[(i+1, f)]), sb[i], txt)
    # print('_' * 50)
    # from pprint import pprint
    # pprint(sb)
    # pprint(tags)
    # pprint(section_ids)

    # calculate keep scores for sections
    # negative scores mean: drop; positive scores mean keep;
    # scores between -2 and 2 are undecided
    sections_keep = {}
    headline_probs = {}
    for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
        if prob := headline_probability(txt, tags[(i, f)], lvl):
            headline_probs[(i, f)] = prob
        w = 0
        n_chars = f - i - 1
        # string length
        w = (n_chars - 80) / 80  # initial weight
        # punctuation
        w += 0.4 * text.count('.') + 0.1 * text.count(',')
        # p tag
        if 'p' in tags[(i + 1, f)]:  # prefer keeping paragraphs
            w += 0.7
        # links
        n_links, link_density, avg_text_len = get_tag_counts(
            ('a',), i, f, tags, text
        )
        if link_density > 0.5:
            w = -n_links
        elif link_density > 0.3 and avg_text_len < 60:
            w = -3
        else:
            n_li, li_density, li_len = get_tag_counts(
                ('li',), i, f, tags, text
            )
            if link_density > 0.2 and li_density > 0.8 and li_len < 50:
                w = -3
        if 52 <= lvl < 60:
            w = max(w, 1.0)
        if 'sidebar' in ' '.join(section_ids.get(i, [])):
            w = -3
        if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt):
            w = -3
        # special chars
        if txt.startswith('←') or txt.endswith('→'):  # wordpress navigation
            w = -3
        # remove boilerplate texts
        if boilerplate_texts and txt in boilerplate_texts:
            w = -10
        sections_keep[(i, f)] = w, lvl

    # amend keep scores: look at preceding / subsequent sections with
    # equal level and transfer their keep scores to the current section
    n = len(sections_keep)
    sections = list(sorted(sections_keep.keys()))
    # inspect subsequent sections:
    for rev_ind, s_range in enumerate(reversed(sections)):
        ind = n - 1 - rev_ind
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            w_sum = 0
            n_peers = 0
            for i in range(ind + 1, min(n, ind + 15)):
                w_, lvl_ = sections_keep[sections[i]]
                if lvl_ != lvl:
                    break
                n_peers += 1
                w_sum += w_
            if n_peers >= 3:
                sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl
    # inspect preceding sections:
    for ind, s_range in enumerate(sections):
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            w_sum = 0
            n_peers = 0
            for i in range(ind - 1, max(0, ind - 15), -1):
                w_, lvl_ = sections_keep[sections[i]]
                if lvl_ != lvl:
                    break
                n_peers += 1
                w_sum += w_
            if n_peers >= 3:
                sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl

    # amend keep scores: look at sections that could be headlines
    # for subsequent kept sections and increase their score;
    # also allow for up to 2 sections inbetween (which will also
    # have their score increased)
    for rev_ind, s_range in enumerate(reversed(sections)):
        ind = n - 1 - rev_ind
        w, lvl = sections_keep[s_range]
        if abs(w) <= 2:
            if headline_probs.get(s_range, 0) > 0.49:
                # look at subsequent sections with higher level
                child_weights = []
                for i in range(ind + 1, n):
                    w_, lvl_ = sections_keep[sections[i]]
                    if lvl_ <= lvl or w_ < -2:
                        break
                    child_weights.append(w_)
                if nc := len(child_weights):
                    child_avg = sum(child_weights) / nc
                    if w + 1.2 * child_avg > 2:
                        sections_keep[s_range] = w + 1.2 * child_avg, lvl
                        if nc > 1:
                            if (w1 := child_weights[0]) <= 2:
                                sections_keep[sections[ind + 1]] = (
                                    w1 + 1.5 * child_avg,
                                    lvl,
                                )
                        if nc > 2:
                            if (w2 := child_weights[1]) <= 2:
                                sections_keep[sections[ind + 2]] = (
                                    w2 + 2 * child_avg,
                                    lvl,
                                )

    # clean annotations
    clean_annotations(annotations)

    # debug sections
    if logger_sections.isEnabledFor(logging.DEBUG):
        logger_sections.debug('============= Weighted sections =============')
        for i, f, lvl, txt in iter_sections(text, sb, max_level=60):
            w, lvl = sections_keep[(i, f)]
            indent = ('+' if w > 2 else '-') * lvl
            ts = ','.join(tags[(i + 1, f)])
            logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}')

    # narrow down annotations and text to keep_sections
    # drop undecided sections
    filtered_text = text
    filtered_ann = deepcopy(annotations)
    for i, f in sorted(sections_keep.keys(), reverse=True):
        w, lvl = sections_keep[(i, f)]
        if w <= 2.0:
            filtered_ann = annotations_remove_section(filtered_ann, i, f)
            filtered_text = filtered_text[:i] + filtered_text[f:]
    clean_annotations(filtered_ann)

    # debug filtered sections
    if logger_sections.isEnabledFor(logging.DEBUG):
        logger_sections.debug('')
        logger_sections.debug('============= Filtered sections =============')
        fsb = filtered_ann['semantic_breaks']
        ftags = filtered_ann['tags']
        for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100):
            indent = ' ' * lvl
            ts = ','.join(ftags.get((i + 1, f), []))
            logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}')

    return filtered_text, filtered_ann
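The weighting in filter_sections above starts from a length-based score and pushes link-dominated sections far below the keep threshold of +2. A simplified, hypothetical re-statement of only the initial weighting (it ignores tags, levels and neighbor smoothing, and counts punctuation on the section text for simplicity):

def initial_weight(txt: str, link_density: float, n_links: int) -> float:
    # sections longer than ~80 characters score positive
    w = (len(txt) - 80) / 80
    w += 0.4 * txt.count('.') + 0.1 * txt.count(',')
    if link_density > 0.5:
        # link lists are dropped aggressively
        w = -n_links
    return w

# A ~200-char paragraph with two sentences clears the +2 keep threshold,
# while a short navigation row of 5 links ends up at -5 and is dropped.
print(initial_weight('x' * 198 + '. .', 0.1, 0))      # ~2.3
print(initial_weight('Home | About | Blog', 0.9, 5))  # -5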
148
src/atextcrawler/resource/plaintext.py
Normal file
@@ -0,0 +1,148 @@
"""
Parse plaintext pages.
"""

import logging
import re
from typing import Any, Optional, Union

import pypandoc

from ..models import ResourceError, ResourceRedirect, Site, TextResource
from ..utils.annotation import annotate
from ..utils.date_finder import extract_latest_date
from ..utils.durl import Durl
from ..utils.http import get_header_links
from ..utils.lang import extract_content_language
from ..utils.muse import parse_muse

logger = logging.getLogger(__name__)


MAX_LINK_TEXT_LENGTH = 100
"""
Maximum length of a link's text to be kept.

Cf. table site_link, column link_text.
"""


re_url = re.compile(
    r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?'
    r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)'
)


re_nl = re.compile(r'\r\n')


re_ws = re.compile(r'\s*\n\s*\n\s*')


re_nn = re.compile(r'\n\n')


async def parse_plaintext(
    durl: Durl,
    resp: dict,
    site: Optional[Site],
) -> Optional[Union[ResourceRedirect, TextResource]]:
    """
    Extract relevant data from a response returning a TextResource instance.

    The given URL must be the full URL (incl. scheme and netloc) of the page.
    """
    text = resp['content']

    # HTTP headers, canonical URL, shortlink
    header_links = await get_header_links(resp['headers'], durl, site)
    if canonical := header_links.get('canonical'):
        if canonical != durl.url():
            return ResourceRedirect(resp['redirects'] + [canonical])
    shortlink = header_links.get('shortlink')

    if not text:
        return None

    text = re_nl.sub('\n', text)
    text = re_ws.sub('\n\n', text)

    # meta info
    meta: dict[str, Any] = {}
    muse = None
    if durl.path.endswith('.muse'):
        muse = parse_muse(text)
        if muse:
            meta, text = muse
    # title
    if not meta.get('title'):
        meta['title'] = text[:200].splitlines()[0]
    # content language
    if not meta.get('lang'):
        meta['lang'] = extract_content_language(text)
    # publication date
    if not meta.get('pub_date'):
        meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang'))

    # links
    links_int: dict[Durl, tuple[list[str], str]] = {}
    links_ext: dict[Durl, tuple[list[str], str]] = {}
    for url in re_url.findall(text):
        link_durl = await Durl(url[0])
        if link_durl:
            if link_durl.site() == durl.site():
                links_int[link_durl] = [], link_durl.url()
            else:
                links_ext[link_durl] = [], link_durl.url()

    if muse:
        html = pypandoc.convert_text(text, 'html5', format='muse').strip()
        text, annotations = annotate(html)
    else:
        text, annotations = annotate_text(text)

    return TextResource(
        content_type=resp['parser'],
        last_change=meta.get('pub_date'),
        text_len=len(text),
        lang=meta.get('lang'),
        title=meta.get('title'),
        init_fields={
            'durl': durl,
            'site': site,
            'headers': resp['headers'],
            'redirects': resp['redirects'],
            'links_int': links_int,
            'links_ext': links_ext,
            'shortlink': shortlink,
            'canonical': None,
        },
        search_fields={
            'title': meta.get('title'),
            'authors': meta.get('authors'),
            'pub_date': meta.get('pub_date'),
            'keywords': meta.get('keywords'),
            'summary': meta.get('summary'),
            'text': text,
            'annotations': annotations,
        },
    )


def annotate_text(text):
    """
    Return annotations as :func:`utils.annotation.annotate` does.

    Here we only have information on semantic breaks
    (in plaintext they are where empty lines are).
    """
    semantic_breaks = {}
    for match in re_nn.finditer(text):
        semantic_breaks[match.span()[0]] = ''
    annotations = {
        'tags': {},
        'semantic_breaks': semantic_breaks,
        'section_ids': {},
        'links': {},
    }
    return text, annotations
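annotate_text above only records where paragraph gaps sit: each key in semantic_breaks is the offset of the first newline of a blank-line gap. A minimal, self-contained illustration with a made-up three-paragraph text:

import re

re_nn = re.compile(r'\n\n')

text = 'First paragraph.\n\nSecond one.\n\nThird.'
breaks = {m.span()[0]: '' for m in re_nn.finditer(text)}
print(breaks)   # {16: '', 29: ''}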
149
src/atextcrawler/resource/sitemap.py
Normal file
@@ -0,0 +1,149 @@
"""
Sitemap and SitemapIndex and related operations.
"""

import logging
from datetime import datetime
from typing import Optional

import pytz

from ..models import Sitemap, SitemapIndex, TextResource

logger = logging.getLogger(__name__)


async def get_sitemap_urls(
    fetcher,
    base_url: Optional[str],
    sitemaps=None,
) -> list[dict]:
    """
    Try to find sitemaps and fetch and return their URL content.

    Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'.
    """
    if sitemaps:
        # test example: https://www.berlin.de/
        check_all = True
    elif base_url:
        sitemaps = [
            base_url.rstrip('/') + '/sitemap.xml',
            base_url.rstrip('/') + '/wp-sitemap.xml',
            base_url.rstrip('/') + '/sitemap_index.xml',
            base_url.rstrip('/') + '/sitemap.xml.gz',
            base_url.rstrip('/') + '/sitemap_index.xml.gz',
            base_url.rstrip('/') + '/sitemap.txt',
            base_url.rstrip('/') + '/sitemap/',
            base_url.rstrip('/') + '/sitemap1.xml',
            base_url.rstrip('/') + '/sitemap-index.xml',
            base_url.rstrip('/') + '/sitemapindex.xml',
            base_url.rstrip('/') + '/sitemap/index.xml',
        ]
        check_all = False
    else:
        return []
    urls = []
    for sitemap in sitemaps:
        resource = await fetcher.fetch(sitemap)
        found = True
        if isinstance(resource, SitemapIndex):
            for sitemap_ in resource.sitemaps:
                sitemaps.append(sitemap_['loc'])
        elif isinstance(resource, Sitemap):
            urls += resource.urls
        elif isinstance(resource, TextResource) and resource.content_type in (
            'html',
            'plain',
        ):
            urls += [
                {'loc': durl.url()}
                for durl in resource.init_fields['links_int']
            ]
        else:
            found = False
        if found and not check_all:
            break
    return urls


def parse_sitemapindex(sitemapindex):
    """
    Parse a sitemap index returning a `SitemapIndex` with found sitemaps.
    """
    sitemaps = []
    for tag in sitemapindex.find_all('sitemap'):
        if loc := tag.find('loc'):
            if loc.string:
                sitemap = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        t = datetime.fromisoformat(lastmod.string.strip())
                        sitemap['lastmod'] = t
                    except:
                        pass
                sitemaps.append(sitemap)
    return SitemapIndex(sitemaps=sitemaps)


def parse_sitemap(urlset) -> Sitemap:
    """
    Parse a sitemap urlset, returning a `Sitemap` with its URLs.

    Each URL is a dict with these keys+values:

    * loc: the full URL of a mapped resource
    * lastmod: optional datetime of its last modification
    * changefreq: optional info on the change frequency to be expected
    * priority: optional info on its priority relative to other resources

    Cf. https://www.sitemaps.org/protocol.html
    """
    urls = []
    for tag in urlset.find_all('url'):
        if loc := tag.find('loc'):
            if loc.string:
                url = {'loc': loc.string.strip()}
                if lastmod := tag.find('lastmod'):
                    try:
                        t = lastmod.string.strip().rstrip('Z')
                        url['lastmod'] = (
                            datetime.fromisoformat(t)
                            .astimezone(pytz.utc)
                            .replace(tzinfo=None)
                        )
                    except:
                        pass
                if changefreq := tag.find('changefreq'):
                    url['changefreq'] = changefreq.string.strip()
                if priority := tag.find('priority'):
                    url['priority'] = priority.string.strip()
                urls.append(url)
    return Sitemap(urls=urls)


def extract_sitemap_paths(
    base_url: Optional[str],
    urls: list[dict],
) -> tuple[list[tuple[str, bool]], Optional[datetime]]:
    """
    Extract essential information from sitemap URLs.

    Return a list of relative paths of the site's resources
    (in a form to be easily fed into `add_site_paths`) and
    the datetime of the latest change.

    Relative paths are computed using base_url.
    """
    paths = []
    latest = None
    for url in urls:
        loc = url['loc']
        lastmod = url.get('lastmod')
        if loc.startswith(base_url or ''):
            path = loc.removeprefix(base_url or '').lstrip('/')
            path = path.split('#', 1)[0]
            paths.append((path, True))
            if lastmod:
                latest = max(lastmod, latest or lastmod)
    return paths, latest
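parse_sitemap above expects an already parsed XML tree from which loc/lastmod/changefreq/priority are picked per url tag. A minimal usage sketch, assuming this module's parse_sitemap and extract_sitemap_paths are importable and using a made-up urlset:

from bs4 import BeautifulSoup

xml = '''
<urlset>
  <url>
    <loc>https://example.org/post/1</loc>
    <lastmod>2021-11-01T12:00:00+00:00</lastmod>
    <changefreq>weekly</changefreq>
  </url>
</urlset>
'''
urlset = BeautifulSoup(xml, 'html.parser')
sitemap = parse_sitemap(urlset)
print(sitemap.urls[0]['loc'])   # https://example.org/post/1

# relative paths for add_site_paths, plus the latest modification time
paths, latest = extract_sitemap_paths('https://example.org/', sitemap.urls)
print(paths)    # [('post/1', True)]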
6
src/atextcrawler/search/__init__.py
Normal file
@@ -0,0 +1,6 @@
from .engine import (
    delete_resource,
    index_resource,
    shutdown_engine,
    startup_engine,
)
270
src/atextcrawler/search/engine.py
Normal file
@@ -0,0 +1,270 @@
"""
Search engine, for now elasticsearch.

We have one index per supported language and a default one.
"""

import logging
import warnings
from difflib import SequenceMatcher
from typing import Union

from elasticsearch import AsyncElasticsearch
from elasticsearch.exceptions import NotFoundError

from ..utils.annotation import pack_annotations
from ..utils.section import concat_section_texts

logger = logging.getLogger(__name__)


warnings.filterwarnings(
    'ignore',
    'The client is unable to verify that the'
    ' server is Elasticsearch due security privileges on the server side',
)


MIN_INDEXING_TIMEOUT_SECONDS = 5


language_analyzers = {
    'en': 'english',
    'de': 'german',
    #'fr': 'french',
    #'el': 'greek',
    #'es': 'spanish',
    'default': 'standard',
}


properties = {
    'resource_id': {'type': 'long'},
    'site_id': {'type': 'long'},
    'url': {'type': 'text'},
    'base_url': {'type': 'text'},
    'pub_date': {'type': 'date', 'format': 'yyyy-MM-dd||yyyy-MM||yyyy'},
    'lang': {'type': 'keyword'},
    'title': {'type': 'text'},
    'authors': {'type': 'text'},
    'summary': {'type': 'text'},
    'keywords': {'type': 'text'},
    'collections': {'type': 'keyword'},
    'time_horizon': {'type': 'keyword'},
    'orig_source': {'type': 'text'},
    'topics': {'type': 'text'},
    'annotations': {'type': 'text', 'index': False},
    'sections': {
        'type': 'nested',
        'properties': {
            'start_ids': {'type': 'integer'},
            'end_ids': {'type': 'integer'},
            'text': {'type': 'text', 'index_options': 'offsets'},
            'embedding': {'type': 'dense_vector', 'dims': 512},
        },
    },
}


async def startup_engine(config):
    """
    Open the search engine for access.
    """
    engine = AsyncElasticsearch(
        host=config['elasticsearch']['host'],
        api_key=(
            config['elasticsearch']['id'],
            config['elasticsearch']['api_key'],
        ),
        use_ssl=False,
        timeout=20,
    )
    engine.index_base_name = config['elasticsearch']['index_base_name']
    await create_indices(engine)
    await open_indices(engine)
    return engine


async def create_indices(engine):
    """
    Create indices for all configured languages.
    """
    for lang, analyzer in language_analyzers.items():
        index_name = engine.index_base_name + '_text_' + lang
        if not await engine.indices.exists(index=index_name):
            await engine.indices.create(index=index_name)
            await engine.indices.close(index=index_name)
            await engine.indices.put_settings(
                index=index_name,
                body={
                    'analysis': {'analyzer': {'default': {'type': analyzer}}},
                    'refresh_interval': '60s',
                },
            )
            await engine.indices.put_mapping(
                index=index_name,
                body={'properties': properties},
            )


async def open_indices(engine):
    """
    Open indices for all configured languages.
    """
    for lang in language_analyzers.keys():
        index_name = engine.index_base_name + '_text_' + lang
        await engine.indices.open(index=index_name)


async def shutdown_engine(engine):
    """
    Close the connection to the search engine.
    """
    # await close_indices(engine)
    await engine.close()


async def close_indices(engine):
    """
    Close indices. UNUSED.
    """
    for lang in language_analyzers.keys():
        index_name = engine.index_base_name + '_text_' + lang
        await engine.indices.close(index=index_name)


async def index_resource(
    engine,
    tf,
    site_path,
    resource,
    base_url,
    url,
):
    """
    Index a resource.
    """
    lang = resource.lang
    index_lang = lang if lang in language_analyzers.keys() else 'default'
    index_name = engine.index_base_name + '_text_' + index_lang
    pub_date = resource.search_fields.get('pub_date')
    if pub_date:
        pub_date = str(pub_date.date())
    text = resource.search_fields.get('text')
    annotations = resource.search_fields.get('annotations')
    semantic_breaks = annotations['semantic_breaks']
    sections = []
    for section_ids, txt in concat_section_texts(text, semantic_breaks):
        embedding = await tf.embed(txt)
        sections.append(
            {
                'start_ids': section_ids[0],
                'end_ids': section_ids[-1],
                'text': txt,
                'embedding': embedding,
            }
        )
    doc = {
        'resource_id': resource.id_,
        'site_id': site_path.site_id,
        'url': url,
        'base_url': base_url,
        'pub_date': pub_date,
        'lang': resource.lang,
        'title': resource.search_fields.get('title'),
        'authors': resource.search_fields.get('authors'),
        'summary': resource.search_fields.get('summary'),
        'keywords': resource.search_fields.get('keywords'),
        'collections': resource.search_fields.get('collections'),
        'time_horizon': resource.search_fields.get('time_horizon'),
        'orig_source': resource.search_fields.get('orig_source'),
        'topics': resource.search_fields.get('topics'),
        'annotations': pack_annotations(annotations),
        'sections': sections,
    }
    timeout_seconds = max(MIN_INDEXING_TIMEOUT_SECONDS, int(len(text) / 1000))
    await engine.index(
        id=resource.id_,
        index=index_name,
        body=doc,
        timeout=f'{timeout_seconds}s',
    )


async def delete_resource(engine, lang, resource_id):
    """
    Delete a resource.
    """
    index_name = engine.index_base_name + '_text_' + (lang or 'default')
    try:
        await engine.delete(index_name, resource_id)
    except NotFoundError:
        msg = f'Cannot delete resource from index, not found: {resource_id}'
        logger.warning(msg)


async def find_duplicate(engine, site_id, resource) -> Union[bool, None, int]:
    """
    UNUSED.

    Try to find a duplicate resource with matching site.

    If the search backend query fails, return False.
    If no matching resource was found, return None.
    If a matching resource was found, return its id.
    """
    # get sample texts
    text = resource.search_fields['text']
    if not text or len(text) < 100:
        return None
    # annotations = resource.search_fields['annotations']
    # semantic_breaks = annotations['semantic_breaks']
    # texts = []
    # for _, txt in concat_section_texts(text, semantic_breaks):
    #     texts.append(txt)
    # texts = extract_samples(texts)

    # # search for sample texts
    # text_count = len(texts)
    # should_min = max(1, int(0.6 * text_count))
    # should = []
    # for text in texts:
    #     should.append({'match': {'sections.text': text}})
    query = {
        'bool': {
            'must': {
                'nested': {
                    'path': 'sections',
                    'query': {'match': {'sections.text': text}},
                },
            },
            'filter': {
                'term': {
                    'site_id': site_id,
                },
            },
        }
    }
    fields = [
        'url',
        'sections.text',
        'site_id',
    ]
    response = await engine.search(
        index=engine.index_base_name + '_text_*',
        body={
            'query': query,
            'fields': fields,
            'from': 0,
            'size': 3,
            '_source': False,
        },
    )
    if response['timed_out']:
        return False
    for hit in response.get('hits', {}).get('hits'):
        txt = ' '.join(hit['fields']['sections.text'])
        similarity = SequenceMatcher(None, text, txt).ratio()
        if similarity > 0.99:
            return hit['_id']
    return None
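Judging from startup_engine above, the expected configuration carries an elasticsearch block with a host, api-key credentials and an index base name. A sketch of such a config dict; the values are placeholders, only the keys are taken from the code:

config = {
    'elasticsearch': {
        'host': 'localhost:9200',        # placeholder
        'id': 'my-api-key-id',           # placeholder
        'api_key': 'my-api-key-secret',  # placeholder
        'index_base_name': 'atext',      # placeholder
    },
}

# engine = await startup_engine(config)   # creates/opens per-language indices
# ...
# await shutdown_engine(engine)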
9
src/atextcrawler/site/__init__.py
Normal file
@@ -0,0 +1,9 @@
"""
Websites.
"""

from .feeds import fetch_feeds
from .operations import checkin_site, checkout_site, process_site, update_site
from .queue import process_site_queue
from .robots import RobotsInfo
from .seed import load_seeds
68
src/atextcrawler/site/__main__.py
Normal file
@@ -0,0 +1,68 @@
"""
Tool for analyzing a website.

Fetch the startpage and output information to console.
Do not change any persistent data.
"""

import asyncio
import logging
import sys

import aiohttp

from ..models import TextResource
from ..resource import ResourceFetcher, extract_sitemap_paths, get_sitemap_urls
from ..site.robots import RobotsInfo
from ..utils.durl import Durl
from .parse import parse_startpage

logger = logging.getLogger()
logger.setLevel(logging.WARNING)
logger.addHandler(logging.StreamHandler())


async def run():
    """
    Fetch the startpage of a website and show information about it.

    The URL must be given as commandline argument.
    """
    base_url = sys.argv[1]
    async with aiohttp.ClientSession() as session:
        if not (base_durl := await Durl(base_url)):
            return
        fetcher = ResourceFetcher(session)
        resource = await fetcher.fetch(base_url)
        logger.warning(repr(resource))
        if (
            isinstance(resource, TextResource)
            and resource.content_type == 'html'
        ):
            site = await parse_startpage(resource)
            # site.crawl_enabled = await site_filter(site)
            logger.warning(repr(site))
            logger.warning('')
            for durl, text in site.links_ext.items():
                logger.warning(f' {durl} {text}')
                logger.warning(f'{durl.url()} -------- {text}')
            logger.warning('')
            logger.warning(f'Redirects: {resource.init_fields["redirects"]}')
            logger.warning('')
            robots = await RobotsInfo(base_url)
            urls = await get_sitemap_urls(
                fetcher, base_url, sitemaps=robots.site_maps
            )
            paths, latest = extract_sitemap_paths(base_url, urls)
            for path in paths:
                logger.warning(path)
            logger.warning(f'Feeds: {site.feeds}')
            logger.warning(latest)
            # sample_links = extract_samples(resource.init_fields['links_int'])
            # logger.warning(f'************* {sample_links}')
        else:
            logger.warning('(No text resource or error.)')


if __name__ == '__main__':
    asyncio.run(run())
100
src/atextcrawler/site/feeds.py
Normal file
@@ -0,0 +1,100 @@
"""
High-level feed-related stuff.

See resource.feed for low-level stuff not primarily related to sites.
"""

from datetime import datetime
from typing import Optional

from ..models import Feed
from ..resource import store_feed_entries, update_feed


async def store_new_feeds(conn, site_id, feeds: dict):
    """
    Store new feeds in table site_feed.
    """
    sql = "SELECT array_agg(url) FROM site_feed WHERE site_id=$1"
    known_feeds = (await conn.fetchval(sql, site_id)) or []
    for feed_url in feeds.keys():
        if feed_url not in known_feeds:
            feed = Feed(
                site_id=site_id,
                url=feed_url,
            )
            await feed.save(conn)


async def get_feeds(conn, site_id) -> list[Feed]:
    """
    Return stored feeds for the given site.
    """
    sql = "SELECT * FROM site_feed WHERE site_id=$1"
    rows = (await conn.fetch(sql, site_id)) or []
    return [(await Feed().load_from_row(row)) for row in rows]


async def fetch_feeds(fetcher, conn, site) -> Optional[datetime]:
    """
    Fetch feeds, add new resources and return the latest content update time.
    """
    feeds = await get_feeds(conn, site.id_)
    latest = None
    for feed in feeds:
        feed_content = await update_feed(fetcher, feed, conn)
        if feed_content:
            await store_feed_entries(conn, site, feed_content)
        if feed.t_content:
            latest = max(latest or feed.t_content, feed.t_content)
    return latest


if __name__ == '__main__':
    # only use this on a dev instance!
    import asyncio
    import logging
    import sys

    import aiohttp

    from ..config import Config
    from ..db import PGPool
    from ..resource.fetch import ResourceFetcher
    from .operations import process_site, update_site

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    config = Config().get()
    url = sys.argv[1]

    async def run():
        """
        Fetch and display a site.
        """
        app = None  # TODO
        async with PGPool(config['postgresql']) as pool:
            async with pool.acquire() as conn:
                async with aiohttp.ClientSession() as session:
                    fetcher = ResourceFetcher(session)
                    site, _ = await update_site(app, fetcher, conn, url)
                    logger.warning(site)
                    await process_site(fetcher, conn, site)
                    latest = await fetch_feeds(fetcher, conn, site)
                    logger.warning(f'latest: {latest}')

                    # feed = Feed(url=url)
                    # feed_content = await update_feed(fetcher, feed, conn)
                    # if isinstance(feed_content, ResourceError):
                    #     print(feed_content)
                    # else:
                    #     print(feed)
                    #     pprint(feed_content[0])
                    # print('---- 2nd try ----')
                    # feed_content = await update_feed(fetcher, feed, conn)
                    # if isinstance(feed_content, ResourceError):
                    #     print(feed_content)
                    # else:
                    #     print(feed)
                    #     pprint(feed_content[0])

    asyncio.run(run())
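The `latest` folding in fetch_feeds above keeps the newest t_content seen so far while tolerating an initial None. A tiny worked example with hypothetical timestamps:

from datetime import datetime

t_contents = [datetime(2021, 11, 1), None, datetime(2021, 12, 24)]

latest = None
for t_content in t_contents:
    if t_content:   # mirrors: if feed.t_content:
        latest = max(latest or t_content, t_content)

print(latest)   # 2021-12-24 00:00:00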
267
src/atextcrawler/site/operations.py
Normal file
@@ -0,0 +1,267 @@
"""
Operations on sites.
"""

import logging
from datetime import datetime, timedelta
from typing import Optional

from asyncpg import Connection

from ..models import Crawl, Site, TextResource
from ..resource import (
    add_site_paths,
    extract_sitemap_paths,
    get_sitemap_urls,
    store_boilerplate_texts,
)
from ..utils.durl import Durl
from ..utils.similarity import get_simhash_index
from .feeds import fetch_feeds, store_new_feeds
from .parse import parse_startpage
from .robots import RobotsInfo

logger = logging.getLogger(__name__)


async def checkout_site(
    app, conn: Connection
) -> tuple[Optional[int], bool, bool]:
    """
    Get the id of a site to be crawled and mark it with crawl_active=true.

    Also return whether the site shall be fully crawled; if not, this
    means that just the resources from the feeds shall be crawled.

    Also return whether more sites might be available.
    """
    async with conn.transaction():
        sql = (
            "SELECT id, next_full_crawl < now() at time zone 'UTC' is_full"
            " FROM site WHERE crawl_enabled AND crawl_active = false"
            " AND (next_full_crawl < now() at time zone 'UTC'"
            " OR next_feed_crawl < now() at time zone 'UTC')"
            " LIMIT 1 FOR UPDATE SKIP LOCKED"
        )
        row = await conn.fetchrow(sql)
        if row:
            site_id = row['id']
            is_full = row['is_full']
            sql = "UPDATE site SET crawl_active = true WHERE id=$1"
            await conn.execute(sql, site_id)
            site = await Site().load(conn, site_id)
            if site:
                site.base_durl = await Durl(site.base_url)
                if site.base_durl:
                    site.simhash_index = await get_simhash_index(conn, site_id)
                    return site, is_full, True
                else:
                    # site not available; schedule next crawl
                    int_full = app.config['crawl']['full_crawl_interval']
                    int_feed = app.config['crawl']['feed_crawl_interval']
                    now = datetime.utcnow()
                    t_full = now + timedelta(seconds=int_full)
                    t_feed = now + timedelta(seconds=int_full + int_feed)
                    sql = (
                        "UPDATE site SET crawl_active=false,"
                        " next_full_crawl=$1, next_feed_crawl=$2"
                        " WHERE id=$3"
                    )
                    await conn.execute(sql, t_full, t_feed, site_id)
                    return None, False, True
            return None, False, True
        return None, False, False


async def update_site(
    app, fetcher, conn: Connection, base_url, site: Site = None
) -> tuple[Optional[Site], bool]:
    """
    Try to fetch base_url and return a site and whether a new one was created.

    This function is run for all sites (including blacklisted and irrelevant
    ones). It determines whether the site shall be crawled.

    If an error occurs, return (None, False), and if a site was given,
    also set it to crawl_enabled=False and remove crawling schedules.

    If base_url could be fetched, update the site, possibly creating
    a new one.

    If the site has crawl_enabled, and no full crawl is scheduled,
    schedule one (by updating column `next_full_crawl`).
    """
    # fetch startpage
    logger.info(f'Updating site={site}, base_url={base_url}')
    resource = await fetcher.fetch(base_url, site=site)
    if (
        not isinstance(resource, TextResource)
        or resource.content_type != 'html'
    ):
        if site:
            site.meta_info['error'] = 'Invalid start page'
            site.crawl_enabled = False
            site.next_full_crawl = None
            site.next_feed_crawl = None
            await site.save(conn)
        logger.info(f'Failed startpage {base_url}: {resource}')
        return None, False

    # parse startpage (extract site information) and save the site
    site = await parse_startpage(resource, app=app, site=site)
    site_id, created = await site.save(conn)
    if created:
        logger.debug(f'Created {site}')

    # add black-/white-listing info
    is_allowed = await is_site_allowed(conn, site.id_, base_url)
    if is_allowed is not None and is_allowed != site.crawl_enabled:
        site.crawl_enabled = is_allowed
        await site.save(conn)

    # schedule full crawl, if none is scheduled and the site shall be crawled
    if site.crawl_enabled:
        sql = (
            "UPDATE site"
            " SET next_full_crawl=now() at time zone 'UTC'"
            " WHERE id=$1 AND next_full_crawl IS null"
        )
        await conn.execute(sql, site_id)

    return site, created


async def is_site_allowed(
    conn: Connection,
    site_id: Optional[int],
    base_url: str,
) -> Optional[bool]:
    """
    Return True if the site is whitelisted, False if blacklisted, else None.

    Also add missing site_ids to the annotations.
    """
    sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2"
    anns = await conn.fetch(sql, site_id, base_url)
    for ann in anns:
        if ann['ann_type'] == 'blacklist':
            return False
        if ann['ann_type'] == 'whitelist':
            return True
    # add missing site_ids
    if site_id and any([ann['site_id'] is None for ann in anns]):
        sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2"
        await conn.execute(sql, site_id, base_url)
    return None


async def process_site(fetcher, conn: Connection, site: Site):
    """
    Process a site: fetch and store more information.

    Store external and internal links; find boilerplate texts;
    fetch sitemaps; fetch feeds; update date of last publication.
    """
    if not site.id_:  # only to satisfy typing
        return
    if site.links_ext:
        await _store_cross_site_links(conn, site.id_, site.links_ext)
    if site.links_int:
        paths = []
        for durl, (rel, _) in site.links_int.items():
            canon = (rel and rel.lower() == 'canonical') or None
            paths.append((durl.pwa(), canon))
        await add_site_paths(conn, site.id_, paths)

    await store_boilerplate_texts(fetcher, conn, site)

    # get sitemaps and add their resources
    robots = await RobotsInfo(site.base_url)  # type: ignore
    urls = await get_sitemap_urls(
        fetcher, site.base_url, sitemaps=robots.site_maps
    )
    paths_, latest = extract_sitemap_paths(site.base_url, urls)
    await add_site_paths(conn, site.id_, paths_)

    # store feeds and their resources
    await store_new_feeds(conn, site.id_, site.feeds)
    latest_ = await fetch_feeds(fetcher, conn, site)
    if latest_:
        latest = max(latest or latest_, latest_)

    # update last_pub
    if latest:
        site.last_pub = latest
        await site.save(conn)


async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl):
    """
    Unlock the site and schedule next crawl.

    *crawl* is the crawl that has just finished (regularly or stopped).

    If the crawl was stopped (t_end is None), just unlock the site.

    Otherwise schedule a crawl of the same type. After a full crawl
    also a feed crawl is scheduled, if there was none scheduled.
    """
    if crawl.t_end is None:
        sql = "UPDATE site SET crawl_active=false WHERE id=$1"
        await conn.execute(sql, site.id_)
    elif crawl.is_full:
        full_interval = app.config['crawl']['full_crawl_interval']
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval)
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_full_crawl=$1,"
            " next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3"
        )
        await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_)
    else:
        feed_interval = app.config['crawl']['feed_crawl_interval']
        next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval)
        sql = (
            "UPDATE site SET crawl_active=false, next_feed_crawl=$1"
            " WHERE id=$2"
        )
        await conn.execute(sql, next_feed_crawl, site.id_)


async def _store_cross_site_links(
    conn: Connection,
    site_id: int,
    links: dict[Durl, tuple[list[str], str]],
) -> None:
    """
    Put outgoing links into site_link/site_queue for existing/unknown sites.

    Separate outgoing links from *site_id* into two classes:
    (a) existing sites (rows in table site) and (b) unknown links.
    Add links from class (a) to table site_link.
    Add links from class (b) to table site_queue.
    """
    # add outgoing cross-site links for existing sites to table site_link
    urls = [url.site() for url in links.keys()]
    values = []
    sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1"
    if rows := await conn.fetch(sql, urls):
        for row in rows:
            if (durl := await Durl(row['url'])) in links.keys():
                _, link_text = links.pop(durl)
                if site_id != row['id']:
                    values.append((site_id, row['id'], link_text))
        sql = (
            "INSERT INTO site_link (src, dst, link_text)"
            " VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING"
        )
        await conn.executemany(sql, values)

    # add outgoing cross-site links for unknown sites to table site_queue
    sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)"
    values = [
        (site_id, durl.site()[:200], link_text[:100])
        for durl, (_, link_text) in links.items()
    ]
    await conn.executemany(sql, values)
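A sketch of the rescheduling arithmetic in checkin_site above: after a finished full crawl, the next full and feed crawls are both offset from the crawl's begin time. The interval values below are invented; the real ones come from config['crawl'].

from datetime import datetime, timedelta

full_interval = 10 * 24 * 3600   # hypothetical: 10 days
feed_interval = 24 * 3600        # hypothetical: 1 day

t_begin = datetime(2021, 12, 1, 6, 0, 0)
next_full_crawl = t_begin + timedelta(seconds=full_interval)
next_feed_crawl = t_begin + timedelta(seconds=feed_interval)

print(next_full_crawl)   # 2021-12-11 06:00:00
print(next_feed_crawl)   # 2021-12-02 06:00:00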
255
src/atextcrawler/site/parse.py
Normal file
@@ -0,0 +1,255 @@
"""
Parsing of a site's startpage.
"""

import re
from datetime import datetime
from typing import Any, Optional

from ..models import Site, TextResource
from ..resource import feed_types
from ..utils.durl import Durl, get_ips
from ..utils.html import clean_html
from ..utils.lang import clean_lang
from ..utils.link import (
    extract_domain,
    in_blacklist,
    link_rels,
    meta_names,
    meta_props,
)

re_meta_keyword_sep = re.compile('[,;\r\n]')


def cut_str(s: Optional[str], l: int) -> Optional[str]:
    """
    Cut a string *s* to a maximal length *l* from the left.
    """
    return s[:l] if s else None


async def parse_startpage(
    startpage: TextResource, app=None, site=None
) -> Site:
    """
    Parse a site's startpage and return a Site instance.

    If a site instance is given, update it.
    """
    durl = startpage.init_fields['durl']
    soup = startpage.init_fields['head']
    meta = collect_meta_tags(soup)
    meta_links = await collect_meta_links(soup, durl)
    links_ext = await collect_external_links(startpage, meta_links)
    links_int = startpage.init_fields['links_int']
    langs = extract_languages(startpage, meta, meta_links)
    title, description, keywords = extract_meta_texts(startpage, meta)

    # feeds
    feeds = meta_links['feeds']
    if 'wordpress' in meta.get('generator', '').lower():
        url = durl.site() + 'feed/'
        feeds[url] = 'application/rss+xml'
    # TODO later: maybe also probe other possible feed paths 'rss', 'rss/'

    # network params (canonical_url, base_urls, domains)
    ips = await get_ips(durl.hostname)
    redirects = []
    for redirect in startpage.init_fields['redirects']:
        redir_url = await Durl(redirect)
        if redir_url:
            redirects.append(redir_url.site())
    base_urls = redirects + [durl.url()]
    domains = [extract_domain(durl.hostname)]

    if site:  # update an existing Site
        site.canonical_url = meta_links['canonical_url'] or site.canonical_url
        site.base_urls = base_urls
        site.domains = domains
        site.ips = ips
        site.last_update = datetime.utcnow()
        site.last_pub = startpage.last_change
        site.langs = langs
        site.alt_langs = meta_links['alt_langs']
        site.title = title
        site.description = description
        site.keywords = keywords
        site.linkbacks.update(meta_links['linkbacks'])
        site.meta_info = meta
        site.__post_init__(
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
        )
    else:  # create new Site instance
        site = Site(
            # post_init fields
            base_durl=durl,
            feeds=feeds,
            links_ext=links_ext,
            links_int=links_int,
            startpage_text=startpage.search_fields['text'],
            # dataclass fields
            canonical_url=meta_links['canonical_url'],
            base_urls=base_urls,
            domains=domains,
            ips=ips,
            last_update=datetime.utcnow(),
            last_pub=startpage.last_change,
            langs=list(langs),
            alt_langs=meta_links['alt_langs'],
            title=title,
            description=description,
            keywords=keywords,
            linkbacks=meta_links['linkbacks'],
            meta_info=meta,
        )
    if site.ips is None and site.url:
        site.ips = await get_ips(site.url.hostname)
    if app and site.startpage_text:
        site_filter = app.plugins['filter_site'].site_filter
        site.crawl_enabled = await site_filter(site)
    return site


def collect_meta_tags(soup):
    """
    Collect selected meta tags (meta_names and meta_props) with their values.
    """
    meta = {}
    for tag in soup.find_all('meta'):
        if (name := tag.get('name')) and name in meta_names:
            meta[name] = tag.get('content')
        if (property := tag.get('property')) in meta_props:
            if content := tag.get('content'):
                meta[property] = content
        if tag.get('http-equiv') == 'content-language':  # old html
            if content := tag.get('content'):
                meta['http_equiv_lang'] = content
    return meta


async def collect_meta_links(soup, base_durl) -> dict[str, Any]:
    """
    Collect link tags with site scope (feeds, linkbacks, canonical, ...).
    """
    linkbacks = {}
    feeds = {}
    alt_langs = {}
    canonical_url = None
    for tag in soup.find_all('link'):
        if not (rels := set(tag.get('rel', []))) or not rels & link_rels:
            continue
        if not (url := tag.get('href')):
            continue
        if not (link_durl := await Durl(url, base=base_durl)):
            continue
        if in_blacklist(link_durl.hostname):
            continue
        link_url = link_durl.url()
        link_type = tag.get('type')
        if link_type in feed_types:
            feeds[link_url] = link_type
        elif 'canonical' in rels:
            canonical_url = link_url
        elif 'alternate' in rels and (hreflang := tag.get('hreflang')):
            if lang := clean_lang(hreflang):
                alt_langs[lang] = link_durl.url()
        elif 'webmention' in rels:
            linkbacks[link_url] = 'webmention'
        elif 'pingback' in rels:
            linkbacks[link_url] = 'pingback'
    if canonical_url:
        if canonical_durl := await Durl(canonical_url):
            canonical_url = canonical_durl.site()
        else:
            canonical_url = None
    return {
        'feeds': feeds,
        'linkbacks': linkbacks,
        'alt_langs': alt_langs,
        'canonical_url': canonical_url,
    }


async def collect_external_links(startpage, meta_links) -> dict[str, str]:
    """
    Return external links (mapping from URL to link text) from startpage.

    Also add links to alternate language variants of the site.
    """
    external_links = startpage.init_fields['links_ext'].copy()
    netloc = startpage.init_fields['durl'].netloc
    for lang, lang_url in meta_links['alt_langs'].items():
        if netloc not in lang_url:
            durl = await Durl(lang_url)
            if durl:
                external_links[durl] = f'Alternate language: {lang}'
    return external_links


def extract_meta_texts(page, meta) -> tuple[str, Optional[str], list[str]]:
    """
    Extract and return title, description, keywords from a page and meta tags.
    """
    title = meta.get('og:site_name')
    if not title:
        title = page.search_fields['title'] or ''
        if meta_title := meta.pop('title', None):
            if meta_title.lower() not in title.lower():
                title += ('; ' if title else '') + meta_title
    title = cut_str(clean_html(title), 200)
    description = cut_str(clean_html(meta.pop('description', None)), 2000)
    if meta_keywords := meta.pop('keywords', None):
        kws = re_meta_keyword_sep.split(meta_keywords)
        keywords = [kw.strip()[:50] for kw in kws if kw.strip()]
        if len(keywords) < 2:
            keywords = [
                kw.strip()[:50]
                for kw in meta_keywords.split(' ')
                if kw.strip()
            ]
    else:
        keywords = []
    return title, description, keywords


def extract_languages(page, meta, meta_links) -> set[str]:
    """
    Extract languages from a page's html tag, meta tags and HTTP headers.

    Also add the language detected in the text content of the page.

    Return a set of ISO 639-1 language codes.

    See also https://www.w3.org/International/questions/qa-http-and-lang and
    https://www.w3.org/International/questions/qa-html-language-declarations
    """
    languages = set()
    if lang := clean_lang(page.lang):
        languages.add(lang)
    if lang := clean_lang(meta.get('http_equiv_lang')):
        languages.add(lang)
|
||||||
|
if lang := clean_lang(meta.get('dc.language')):
|
||||||
|
languages.add(lang)
|
||||||
|
if lang := clean_lang(meta.get('og:locale')):
|
||||||
|
languages.add(lang)
|
||||||
|
for lang, lang_url in meta_links['alt_langs'].items():
|
||||||
|
if page.init_fields['durl'].netloc in lang_url:
|
||||||
|
if lng := clean_lang(lang):
|
||||||
|
languages.add(lng)
|
||||||
|
lngs = (
|
||||||
|
page.init_fields['headers']
|
||||||
|
.get('Content-Language', '')
|
||||||
|
.lower()
|
||||||
|
.replace(' ', '')
|
||||||
|
.split(',')
|
||||||
|
)
|
||||||
|
for lng in lngs:
|
||||||
|
if lang := clean_lang(lng):
|
||||||
|
languages.add(lang)
|
||||||
|
languages.add(page.lang)
|
||||||
|
return languages
|
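A minimal usage sketch of the helpers above (not part of the committed file; it assumes a coroutine context, a fetched startpage `html` string and this module's own imports such as Durl):

from bs4 import BeautifulSoup

async def inspect_startpage(html: str):
    # hypothetical helper, for illustration only
    soup = BeautifulSoup(html, 'html.parser')
    meta = collect_meta_tags(soup)                 # selected meta names/properties
    base_durl = await Durl('https://example.com/')
    meta_links = await collect_meta_links(soup, base_durl)
    return meta.get('description'), meta_links['feeds'], meta_links['canonical_url']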
127
src/atextcrawler/site/queue.py
Normal file
@ -0,0 +1,127 @@
"""
Queue of sites.

When processing a resource, its external links are put into database table
`site_queue`.
The items in `site_queue` are processed in :func:`process_site_queue`.
This is done baseURL by baseURL (see :func:`iter_site_queue`).
While doing this, cross-site links are put into table `site_link`.
"""

import logging
from typing import AsyncIterator, Optional

import aiohttp
from asyncpg import Connection

from ..resource import ResourceFetcher
from .operations import update_site

logger = logging.getLogger(__name__)


async def process_site_queue(app, pool):
    """
    Loop over queued sites creating new sites and adding cross-site links.
    """
    site_delay = app.config['crawl']['site_delay']
    resource_delay = app.config['crawl']['resource_delay']
    async with pool.acquire() as conn:
        async with aiohttp.ClientSession() as session:
            fetcher = ResourceFetcher(session)
            while app.running:
                async for base_url, links_from in iter_site_queue(app, conn):
                    # get or create site
                    msg = f'Site queue: updating {base_url}'
                    logger.debug(msg)
                    site, created = await update_site(
                        app, fetcher, conn, base_url
                    )
                    if site:
                        await store_incoming_site_site_links(
                            conn, site.id_, links_from
                        )
                    # delete handled queue items
                    sql = "DELETE FROM site_queue WHERE url=$1"
                    await conn.execute(sql, base_url)
                    await app.sleep(resource_delay)
                logger.debug(
                    f'Queued sites exhausted, sleeping'
                    f' for {site_delay} seconds'
                )
                await app.sleep(site_delay)


async def iter_site_queue(
    app, conn: Connection
) -> AsyncIterator[tuple[str, dict[int, str]]]:
    """
    Yield URLs with aggregated link information from site_queue.

    Yield a URL and a dict mapping ids of linking sites to link texts.
    """
    site_revisit_interval = app.config['crawl']['site_revisit_interval']
    while app.running:
        sql = (
            "SELECT url, array_agg(src) srcs,"
            " array_agg(link_text) link_texts"
            " FROM site_queue GROUP BY url LIMIT 1"
        )
        row = await conn.fetchrow(sql)
        if row:
            base_url = row['url']
            links_from = {}
            srcs = row['srcs']
            link_texts = row['link_texts']
            for i in range(len(srcs)):
                if src := srcs[i]:
                    links_from[src] = link_texts[i]
            if site_id := await site_recently_updated(
                conn, base_url, site_revisit_interval
            ):
                # just store incoming links and remove the site from the queue
                await store_incoming_site_site_links(conn, site_id, links_from)
                sql = "DELETE FROM site_queue WHERE url=$1"
                await conn.execute(sql, base_url)
            else:
                yield base_url, links_from
        else:
            break


async def site_recently_updated(
    conn: Connection,
    base_url: str,
    site_revisit_interval: float,
) -> Optional[int]:
    """
    Return the id of the site with given base_url if it was updated recently.
    """
    sql = (
        f"SELECT id FROM site WHERE $1=any(base_urls)"
        f" AND last_update + interval '{site_revisit_interval} seconds'"
        f" > now() at time zone 'utc' LIMIT 1"
    )
    site_id = await conn.fetchval(sql, base_url)
    return site_id


async def store_incoming_site_site_links(
    conn: Connection, site_id: int, links_from: dict
):
    """
    Store incoming site-site links (irrespective of crawl_enabled).

    *site_id* is the id of the site to which the links in *links_from* point.
    """
    sql = (
        "INSERT INTO site_link"
        " (src, dst, link_text) VALUES ($1, $2, $3)"
        " ON CONFLICT (src, dst) DO NOTHING"
    )
    values = [
        (from_id, site_id, link_text)
        for from_id, link_text in links_from.items()
        if from_id != site_id
    ]
    await conn.executemany(sql, values)
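How this loop is wired up is not shown in this file; a hedged sketch, assuming `app` and `pool` come from the application setup:

import asyncio

async def start_queue_processing(app, pool):
    # hypothetical wrapper; process_site_queue runs until app.running is False
    task = asyncio.create_task(process_site_queue(app, pool))
    await task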
98
src/atextcrawler/site/robots.py
Normal file
@ -0,0 +1,98 @@
"""
Fetch and evaluate a website's robots.txt.
"""

import logging
from typing import Optional, Union
from urllib.robotparser import RobotFileParser

import aiohttp

logger = logging.getLogger(__name__)


class RobotsInfo(RobotFileParser):
    """
    Obtain information from a site's robots.txt.

    After instantiation, the instance must be awaited to fetch and
    parse the robots.txt.
    """

    def __init__(
        self,
        site_url: str,
        user_agent: str = '*',
        session: aiohttp.ClientSession = None,
    ):
        super().__init__()
        self.__user_agent = user_agent
        self.__site_url = site_url.rstrip('/')
        self.__robots_url = self.__site_url + '/robots.txt'
        self.__timeout = aiohttp.ClientTimeout(sock_connect=2, sock_read=3)
        self.__session = session

    def __await__(self):
        return self.__ainit__().__await__()

    async def __ainit__(self):
        if self.__session:
            content = await self.__get_robots_txt(self.__session)
        else:
            async with aiohttp.ClientSession() as session:
                content = await self.__get_robots_txt(session)
        self.parse(content.splitlines())
        self.__delay = self.crawl_delay(self.__user_agent)
        request_rate = self.request_rate(self.__user_agent)
        if request_rate:
            self.__delay = request_rate.seconds / request_rate.requests
        self.__site_maps = super().site_maps() or []
        return self

    async def __get_robots_txt(self, session: aiohttp.ClientSession) -> str:
        """
        Fetch and return the robots.txt over http.
        """
        try:
            async with session.get(
                self.__robots_url, timeout=self.__timeout
            ) as resp:
                if resp.status == 200:
                    try:
                        content = await resp.text()
                    except:
                        body = await resp.read()
                        content = body.decode(
                            resp.charset or 'utf-8', errors='ignore'
                        )
                else:
                    content = ''
        except aiohttp.ClientError:
            content = ''
        return content

    @property
    def user_agent(self) -> str:
        """
        The user agent being used.
        """
        return self.__user_agent

    @property
    def delay(self) -> Optional[Union[int, float]]:
        """
        The delay to be used between requests.
        """
        return self.__delay

    @property
    def site_maps(self) -> list[str]:
        """
        The list of sitemaps of the site.
        """
        return self.__site_maps

    def can_fetch_url(self, url: str) -> bool:
        """
        Return whether fetching of the given *url* is allowed.
        """
        return super().can_fetch(self.__user_agent, url)
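Since RobotsInfo is awaited rather than started explicitly, a typical use looks like this (sketch with an example URL):

async def fetch_allowed(url: str) -> bool:
    robots = await RobotsInfo('https://example.com')
    # robots.delay and robots.site_maps are available after the await
    return robots.can_fetch_url(url)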
72
src/atextcrawler/site/seed.py
Normal file
@ -0,0 +1,72 @@
"""
Seeding of new installations with URLs from blacklists and whitelists.
"""

from pathlib import Path

import asyncpg

from ..utils.durl import Durl


async def load_seeds(config: dict, pool: asyncpg.Pool) -> None:
    """
    Add seed file contents (site blacklist and whitelist).

    If there are sites already, do nothing.
    """
    async with pool.acquire() as conn:
        site_count = await conn.fetchval("SELECT count(*) FROM site")
        if site_count:
            return

        # add blacklist entries
        values = []
        blacklist = _load_list(config['config_dir'], 'black')
        for base_url in blacklist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                values.append((url, {'source': 'seed file'}))
        sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'blacklist', $2)"
        )
        await conn.executemany(sql, values)

        # add whitelist entries
        values1 = []
        values2 = []
        whitelist = _load_list(config['config_dir'], 'white')
        for base_url in whitelist:
            durl = await Durl(base_url)
            if durl:
                url = durl.site()
                if url not in blacklist:
                    values1.append((url, {'source': 'seed file'}))
                    values2.append((url,))
        sql = (
            "INSERT INTO site_annotation (base_url, ann_type, ann_content)"
            " VALUES ($1, 'whitelist', $2)"
        )
        await conn.executemany(sql, values1)
        sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)"
        await conn.executemany(sql, values2)


def _load_list(config_dir, black_white):
    """
    Load the seed black or white list.
    """
    path = Path(config_dir) / 'initial_data' / f'seed_urls.list'
    with open(path, 'r') as list_file:
        urls = []
        for line in list_file.read().strip().splitlines():
            line_ = line.strip()
            if line_.startswith('#'):
                continue
            if black_white == 'black' and line_.startswith('-'):
                urls.append(line_[1:].strip())
            if black_white == 'white' and line_.startswith('+'):
                urls.append(line_[1:].strip())
        return urls
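From _load_list the expected layout of config_dir/initial_data/seed_urls.list can be inferred: lines starting with '#' are comments, '-' marks a blacklist entry, '+' a whitelist entry. An illustrative example (the URLs are placeholders):

# seed URLs
+https://example.org/
+https://blog.example.net/
-https://spam.example.com/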
69
src/atextcrawler/tensorflow.py
Normal file
@ -0,0 +1,69 @@
"""
Query the tensorflow_model_server's REST API.
"""

import logging
from typing import Optional, Union

import aiohttp

logger = logging.getLogger(__name__)


class TensorFlow:
    """
    Fetch an embedding vector from the tensorflow model server.
    """

    def __init__(
        self,
        app,
        session: aiohttp.ClientSession,
        timeout_sock_connect: Union[int, float] = 0.5,
        timeout_sock_read: Union[int, float] = 10,
    ):
        self.config = app.config['tensorflow']
        self.session = session
        self.timeout = aiohttp.ClientTimeout(
            sock_connect=timeout_sock_connect, sock_read=timeout_sock_read
        )

    async def embed(
        self, text: Union[str, list[str]]
    ) -> Optional[Union[list[float], list[list[float]]]]:
        """
        Query the tensorflow_model_server's REST API for a prediction.

        Take a string or a list of strings and return an embedding vector
        or a list of embedding vectors.

        If the request fails or times out, return None.
        """
        text_ = text if isinstance(text, list) else [text]
        data = {'signature_name': 'serving_default', 'instances': text_}
        try:
            async with self.session.post(
                self.config['model_server_endpoint'],
                json=data,
                timeout=self.timeout,
            ) as resp:
                try:
                    res = await resp.json()
                    if isinstance(text, list):
                        return res.get('predictions')
                    else:
                        return res.get('predictions')[0]
                except:
                    msg = 'Got invalid response from tensorflow'
                    logger.error(msg)
                    return None
        except Exception as err:
            msg = 'Could not get embedding from tensorflow for '
            if isinstance(text, str):
                msg += f'string of length {len(text)}'
            else:
                msg += 'list of strings with lengths '
                msg += ','.join([str(len(s)) for s in text])
            msg += f', reason: {err}'
            logger.error(msg)
            return None
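A usage sketch, assuming an `app` whose config provides tensorflow.model_server_endpoint (e.g. a locally running tensorflow_model_server):

import aiohttp

async def embed_demo(app):
    async with aiohttp.ClientSession() as session:
        tf = TensorFlow(app, session)
        single = await tf.embed('Hello world')                  # one vector, or None
        batch = await tf.embed(['first text', 'second text'])   # list of vectors, or None
        return single, batch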
0
src/atextcrawler/utils/__init__.py
Normal file
481
src/atextcrawler/utils/annotation.py
Normal file
@ -0,0 +1,481 @@
|
||||||
|
"""
|
||||||
|
Convert html to plain text with annotations over character ranges.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
|
from .json import json_dumps, json_loads
|
||||||
|
from .link import nofollow_link_rels
|
||||||
|
from .tag import keep_tags, self_closing_tags
|
||||||
|
|
||||||
|
MAX_HREF_LENGTH = 200
|
||||||
|
"""
|
||||||
|
Maximum length of an href. Other links are discarded.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
text_blacklist = [
|
||||||
|
'previous',
|
||||||
|
'next',
|
||||||
|
'back', # common pagination navigation
|
||||||
|
'↩︎', # amusewiki footnote separator (after conversion from muse to html)
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
Texts to ignore.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class AnnotatingParser(HTMLParser):
|
||||||
|
"""
|
||||||
|
Parse tagged text resulting in pure text and annotations.
|
||||||
|
|
||||||
|
The text is available in self.text and the annotations
|
||||||
|
in self.annotations, which is a dict with these keys:
|
||||||
|
|
||||||
|
* tags: contains a mapping of offset ranges (i, f) to
|
||||||
|
the tags opening at i and closing at f
|
||||||
|
* semantic_breaks: a mapping of offset positions where
|
||||||
|
a new section begins to the nesting level of that
|
||||||
|
sections; a section is wherever an (opening or closing)
|
||||||
|
separating tag is placed in the raw html; for the
|
||||||
|
separating flag of tags see tag.py
|
||||||
|
* links: a mapping of hrefs to link texts obtained from
|
||||||
|
anchor (a) tags; we skip hyperref with nofollow rels
|
||||||
|
* section_ids: map an offset position to the first
|
||||||
|
id attribute (of any tag) at the beginning of a
|
||||||
|
semantic section; this can later be used in a URL
|
||||||
|
fragment for linking directly into this section
|
||||||
|
|
||||||
|
Internally, we put opening tags on self.stack and pop them
|
||||||
|
when the first matching closing tag is encountered. We assume
|
||||||
|
balanced tags (tidy html).
|
||||||
|
|
||||||
|
NB: all tags with semantic breaks have sep=True, i.e.,
|
||||||
|
they will have spaces around them so that the semantic breaks
|
||||||
|
always sit on a space; the semantic break position p is the end
|
||||||
|
of the last section and the next sections begins at p + 1.
|
||||||
|
|
||||||
|
The text always begins with a ' ' (added if not in the original),
|
||||||
|
which is assigned a semantic break with default level 80
|
||||||
|
(if there is no semantic break tag at the beginning).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.text = ' ' # concatenated text data (without tags)
|
||||||
|
self.pos = 1 # equal to len(self.text)
|
||||||
|
self.stack = []
|
||||||
|
self.tags = defaultdict(dict)
|
||||||
|
self.semantic_breaks = {0: 80}
|
||||||
|
self.tag_id = None
|
||||||
|
self.section_ids = defaultdict(list)
|
||||||
|
self.links = {}
|
||||||
|
self.add_space = False
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
"""
|
||||||
|
Finish by collecting results in dict `self.annotations`.
|
||||||
|
"""
|
||||||
|
super().close()
|
||||||
|
self.annotations = {}
|
||||||
|
self.annotations['links'] = self.links
|
||||||
|
self.annotations['semantic_breaks'] = {
|
||||||
|
pos: lvl for pos, lvl in sorted(self.semantic_breaks.items())
|
||||||
|
}
|
||||||
|
self.annotations['tags'] = self.tags
|
||||||
|
self.annotations['section_ids'] = self.section_ids
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attrs):
|
||||||
|
"""
|
||||||
|
Called for each opening tag.
|
||||||
|
"""
|
||||||
|
sep, lvl, sem = keep_tags[tag]
|
||||||
|
attrs = dict(attrs)
|
||||||
|
if sep:
|
||||||
|
self.add_space = True
|
||||||
|
if tag == 'section' and 'endnotes' in attrs.get('role', ''):
|
||||||
|
lvl = 25
|
||||||
|
# ARIA roles
|
||||||
|
if role := attrs.get('role'):
|
||||||
|
if role == 'article':
|
||||||
|
lvl = 15
|
||||||
|
elif role == 'heading':
|
||||||
|
if aria_level := attrs.get('aria-level'):
|
||||||
|
if aria_level in (1, 2, 3, 4, 5, 6):
|
||||||
|
sep, lvl, sem = keep_tags[f'h{aria_level}']
|
||||||
|
elif role == 'region':
|
||||||
|
lvl = 24
|
||||||
|
i = self.pos
|
||||||
|
if tag in self_closing_tags:
|
||||||
|
# self-closing tags will not be added to the result tags,
|
||||||
|
# they only appear in semantic_breaks
|
||||||
|
# the two self-closing tags br and hr both have lvl and sep
|
||||||
|
if i == 1: # replace the default semantic break at pos 0
|
||||||
|
i = 0
|
||||||
|
self.add_semantic_break(i, lvl)
|
||||||
|
i += 1
|
||||||
|
if tag_id := attrs.get('id'):
|
||||||
|
self.tag_id = i, tag_id
|
||||||
|
self.add_tag_id(i) # br or hr may have an id, too
|
||||||
|
self.add_space = True
|
||||||
|
else:
|
||||||
|
self.stack.append((i, tag, sep, lvl, sem, attrs))
|
||||||
|
# forget outdated tag id at new semantic break
|
||||||
|
if lvl:
|
||||||
|
self.forget_tag_id()
|
||||||
|
# memorize tag id
|
||||||
|
if not self.tag_id and (tag_id := attrs.get('id')):
|
||||||
|
self.tag_id = self.pos, tag_id
|
||||||
|
|
||||||
|
def handle_endtag(self, tag):
|
||||||
|
"""
|
||||||
|
Called for each closing tag.
|
||||||
|
"""
|
||||||
|
if not self.stack or (self.stack and self.stack[-1][1] != tag):
|
||||||
|
return # nothing to do for an already closed self-closing tag
|
||||||
|
i, tag_, sep, lvl, sem, attrs = self.stack.pop()
|
||||||
|
f = self.pos
|
||||||
|
# omit tag without content
|
||||||
|
if i == f:
|
||||||
|
return
|
||||||
|
# for a closing div tag revise lvl to minimum level of contained
|
||||||
|
# semantic breaks (if any)
|
||||||
|
if tag == 'div':
|
||||||
|
min_lvl = 101
|
||||||
|
for pos_, lvl_ in reversed(self.semantic_breaks.items()):
|
||||||
|
if pos_ <= i:
|
||||||
|
break
|
||||||
|
min_lvl = min(min_lvl, lvl_)
|
||||||
|
if min_lvl < 101:
|
||||||
|
lvl = min_lvl
|
||||||
|
# add semantic break and an optional section_id
|
||||||
|
if lvl:
|
||||||
|
if i == 1: # replace the default semantic break at pos 0
|
||||||
|
i = 0
|
||||||
|
if tag in ('ul', 'ol', 'li'):
|
||||||
|
seen_tags = [x[1] for x in self.stack]
|
||||||
|
if 'p' not in seen_tags:
|
||||||
|
lvl = 52 + seen_tags.count('tag')
|
||||||
|
if tag == 'li':
|
||||||
|
lvl += 1
|
||||||
|
self.add_semantic_break(i, lvl)
|
||||||
|
self.add_tag_id(i)
|
||||||
|
# do not include surrounding spaces in tag span
|
||||||
|
if self.text[i] == ' ':
|
||||||
|
i += 1
|
||||||
|
# add tag
|
||||||
|
self.tags[(i, f)][tag] = sem
|
||||||
|
# add space (when handling next data)
|
||||||
|
if sep:
|
||||||
|
self.add_space = True
|
||||||
|
# collect links
|
||||||
|
if tag == 'a':
|
||||||
|
self.extract_link(i, attrs)
|
||||||
|
|
||||||
|
def handle_data(self, text):
|
||||||
|
"""
|
||||||
|
Called for each non-tag content between tags.
|
||||||
|
"""
|
||||||
|
# handle empty or blacklisted text
|
||||||
|
if text == '':
|
||||||
|
return
|
||||||
|
if text == ' ':
|
||||||
|
self.add_space = True
|
||||||
|
return
|
||||||
|
if text.strip().lower() in text_blacklist:
|
||||||
|
if ' ' in text:
|
||||||
|
self.add_space = True
|
||||||
|
return
|
||||||
|
# add a space (at self.pos) if the text begins with one
|
||||||
|
# or if we shall add one
|
||||||
|
startswith_space = text.startswith(' ')
|
||||||
|
text = text.lstrip()
|
||||||
|
if startswith_space or self.add_space:
|
||||||
|
if self.text[-1] != ' ':
|
||||||
|
self.text += ' '
|
||||||
|
self.pos += 1
|
||||||
|
self.add_space = False
|
||||||
|
# strip a space at the end of text and handle it in end tag
|
||||||
|
if text.endswith(' '):
|
||||||
|
text = text[:-1]
|
||||||
|
self.add_space = True
|
||||||
|
# add text to self.text
|
||||||
|
self.text += text
|
||||||
|
self.pos += len(text)
|
||||||
|
|
||||||
|
def add_semantic_break(self, pos, lvl):
|
||||||
|
"""
|
||||||
|
Add a semantic break of level *lvl* at position *pos*.
|
||||||
|
"""
|
||||||
|
if pos in self.semantic_breaks:
|
||||||
|
self.semantic_breaks[pos] = min(self.semantic_breaks[pos], lvl)
|
||||||
|
else:
|
||||||
|
self.semantic_breaks[pos] = lvl
|
||||||
|
|
||||||
|
def forget_tag_id(self):
|
||||||
|
"""
|
||||||
|
Reset a tag id if it is too far behind in the text stream.
|
||||||
|
"""
|
||||||
|
if self.tag_id:
|
||||||
|
pos_, tag_id = self.tag_id
|
||||||
|
if pos_ + 200 < self.pos:
|
||||||
|
self.tag_id = None
|
||||||
|
|
||||||
|
def add_tag_id(self, pos):
|
||||||
|
"""
|
||||||
|
Add and clear an id if the just closing section has none yet.
|
||||||
|
|
||||||
|
*pos* is the start position of the current section, and the
|
||||||
|
position where the id will be added.
|
||||||
|
|
||||||
|
Add an id only if we are not too far in the section's text already.
|
||||||
|
"""
|
||||||
|
if self.tag_id:
|
||||||
|
pos_, tag_id = self.tag_id
|
||||||
|
if pos_ < pos + 100 and pos not in self.section_ids:
|
||||||
|
self.section_ids[pos].append(tag_id.lower())
|
||||||
|
self.tag_id = None
|
||||||
|
|
||||||
|
def extract_link(self, i, attrs):
|
||||||
|
"""
|
||||||
|
Add a link covering character range (i, self.pos).
|
||||||
|
|
||||||
|
From html *attrs* extract href and rel.
|
||||||
|
"""
|
||||||
|
if (href := attrs.get('href')) and not attrs.get('rel') == 'nofollow':
|
||||||
|
if href.startswith('#'):
|
||||||
|
return
|
||||||
|
if len(href) > MAX_HREF_LENGTH:
|
||||||
|
return
|
||||||
|
attrs.get('title', '')
|
||||||
|
if rel := attrs.get('rel'):
|
||||||
|
if set(rel) & nofollow_link_rels:
|
||||||
|
return
|
||||||
|
self.links[href] = i, self.pos, rel
|
||||||
|
|
||||||
|
|
||||||
|
def annotate(html):
|
||||||
|
"""
|
||||||
|
Split html text into plain text with annotations (from AnnotatingParser).
|
||||||
|
"""
|
||||||
|
parser = AnnotatingParser()
|
||||||
|
parser.reset()
|
||||||
|
parser.feed(html)
|
||||||
|
parser.close()
|
||||||
|
return parser.text, parser.annotations
|
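A small round-trip sketch for annotate(); the input is illustrative and assumes the listed tags are among keep_tags in tag.py, and the exact break levels depend on that module:

html = '<h1 id="intro">Title</h1><p>Some <em>text</em> with a <a href="https://example.com/">link</a>.</p>'
text, annotations = annotate(html)
# text is the plain text (starting with a space); annotations has the keys
# 'tags', 'semantic_breaks', 'section_ids' and 'links' described above, e.g.
# annotations['links'] maps 'https://example.com/' to (start, end, rel).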
||||||
|
|
||||||
|
|
||||||
|
re_footnote = re.compile(r'^\s*\[\d+\]\s+')
|
||||||
|
|
||||||
|
|
||||||
|
def headline_probability(text, tags, lvl) -> float:
|
||||||
|
"""
|
||||||
|
Estimate the probability that the text with tags is a headline.
|
||||||
|
|
||||||
|
The context is not considered: The question is not whether the
|
||||||
|
text is a headline for the following text.
|
||||||
|
"""
|
||||||
|
text = text.strip()
|
||||||
|
res = 0.0
|
||||||
|
if not text:
|
||||||
|
return res
|
||||||
|
if lvl < 60:
|
||||||
|
return 1.0
|
||||||
|
# if 'h1' in tags or 'h2' in tags or 'h3' in tags or\
|
||||||
|
# 'h4' in tags or 'h5' in tags or 'h6' in tags or 'center' in tags:
|
||||||
|
# return 1.0
|
||||||
|
if len(text) < 80:
|
||||||
|
res = 0.7
|
||||||
|
else:
|
||||||
|
res = 0.7 - 0.7 * (len(text) - 80) / 200
|
||||||
|
if 'p' in tags:
|
||||||
|
res -= 0.4
|
||||||
|
if 'em' in tags:
|
||||||
|
res += 0.3
|
||||||
|
if 'a' in tags:
|
||||||
|
res -= 0.1
|
||||||
|
if text[-1] in '.:':
|
||||||
|
res -= 0.3
|
||||||
|
res -= 0.1 * text.count(', ')
|
||||||
|
if re_footnote.match(text):
|
||||||
|
res -= 0.4
|
||||||
|
return max(res, 0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def get_tag_counts(tag_names, i, f, tags, text) -> tuple[int, float, float]:
|
||||||
|
"""
|
||||||
|
Return the info on the share of characters covered with one of the *tags*.
|
||||||
|
|
||||||
|
Only consider the characters between i and f of string *text*.
|
||||||
|
|
||||||
|
Return the number of tags that have an overlap in the specified region,
|
||||||
|
the tag density in the region (fraction of covered characters by all),
|
||||||
|
and the average number of covered chars per tag.
|
||||||
|
|
||||||
|
NB: If more than one tag name is given, then the fractional share
|
||||||
|
may exceed 1.
|
||||||
|
"""
|
||||||
|
if i == f:
|
||||||
|
return 0, 0.0, 0.0
|
||||||
|
tag_count = 0
|
||||||
|
covered_chars = 0
|
||||||
|
for (s_i, s_f), anns in tags.items():
|
||||||
|
if overlap := range_overlap(i, f - 1, s_i, s_f - 1):
|
||||||
|
for ann in anns:
|
||||||
|
if ann in tag_names:
|
||||||
|
tag_count += 1
|
||||||
|
covered_chars += overlap[1] - overlap[0]
|
||||||
|
all_chars = f - i
|
||||||
|
tag_density = covered_chars * 1.0 / all_chars
|
||||||
|
avg_text_len = covered_chars * 1.0 / tag_count if tag_count else 0
|
||||||
|
return tag_count, tag_density, avg_text_len
|
||||||
|
|
||||||
|
|
||||||
|
def range_overlap(i1, f1, i2, f2):
|
||||||
|
"""
|
||||||
|
Return the overlap of both ranges (None if there is none).
|
||||||
|
"""
|
||||||
|
return None if f1 <= i2 or f2 <= i1 else (max(i1, i2), min(f1, f2))
|
||||||
|
|
||||||
|
|
||||||
|
def annotations_remove_section(annotations, i, f):
|
||||||
|
"""
|
||||||
|
Remove section (i, f) from annotations and return result.
|
||||||
|
"""
|
||||||
|
new_annotations = {}
|
||||||
|
d = f - i
|
||||||
|
if not d:
|
||||||
|
return annotations
|
||||||
|
|
||||||
|
# relocate tags
|
||||||
|
new_tags = {}
|
||||||
|
for (t_i, t_f), anns in annotations['tags'].items():
|
||||||
|
n_i, n_f = cut_range(i, f, d, t_i, t_f)
|
||||||
|
if n_i is not None:
|
||||||
|
new_tags[(n_i, n_f)] = anns
|
||||||
|
new_annotations['tags'] = new_tags
|
||||||
|
|
||||||
|
# relocate links
|
||||||
|
new_links = {}
|
||||||
|
for href, (l_i, l_f, rel) in annotations['links'].items():
|
||||||
|
n_i, n_f = cut_range(i, f, d, l_i, l_f)
|
||||||
|
if n_i is not None:
|
||||||
|
new_links[href] = n_i, n_f, rel
|
||||||
|
|
||||||
|
# relocate semantic breaks and section_ids
|
||||||
|
semantic_breaks = annotations['semantic_breaks']
|
||||||
|
section_ids = annotations['section_ids']
|
||||||
|
new_semantic_breaks = {}
|
||||||
|
new_section_ids = {}
|
||||||
|
for pos in sorted(semantic_breaks.keys()):
|
||||||
|
level = semantic_breaks[pos]
|
||||||
|
if i <= pos and pos < f:
|
||||||
|
continue # discard
|
||||||
|
elif f <= pos:
|
||||||
|
new_semantic_breaks[pos - d] = level
|
||||||
|
if pos in section_ids:
|
||||||
|
new_section_ids[pos - d] = section_ids[pos]
|
||||||
|
else:
|
||||||
|
new_semantic_breaks[pos] = level
|
||||||
|
if pos in section_ids:
|
||||||
|
new_section_ids[pos] = section_ids[pos]
|
||||||
|
|
||||||
|
# collect and return results
|
||||||
|
new_annotations['semantic_breaks'] = new_semantic_breaks
|
||||||
|
new_annotations['section_ids'] = new_section_ids
|
||||||
|
new_annotations['links'] = new_links
|
||||||
|
return new_annotations
|
||||||
|
|
||||||
|
|
||||||
|
def cut_range(i, f, d, t_i, t_f):
|
||||||
|
"""
|
||||||
|
Return the new coordinates of a text range (t_i,t_f) after cutting (i,f).
|
||||||
|
|
||||||
|
If (t_i,t_f) is fully within (i,f), return None, None.
|
||||||
|
"""
|
||||||
|
if t_f < i:
|
||||||
|
return t_i, t_f
|
||||||
|
elif t_i < i <= t_f <= f:
|
||||||
|
return t_i, i
|
||||||
|
elif t_i < i and f <= t_f:
|
||||||
|
return t_i, t_f - d
|
||||||
|
elif i <= t_i and t_f <= f:
|
||||||
|
return None, None
|
||||||
|
elif i <= t_i <= f < t_f:
|
||||||
|
return i, t_f - d
|
||||||
|
else: # f < t_i
|
||||||
|
return t_i - d, t_f - d
|
||||||
|
|
||||||
|
|
||||||
|
def clean_annotations(annotations: dict) -> None:
|
||||||
|
"""
|
||||||
|
Remove void stuff from annotations.
|
||||||
|
"""
|
||||||
|
cleaned_tags = {}
|
||||||
|
for (i, f), anns in annotations['tags'].items():
|
||||||
|
if f > i and anns:
|
||||||
|
cleaned_tags[(i, f)] = anns
|
||||||
|
annotations['tags'] = cleaned_tags
|
||||||
|
|
||||||
|
|
||||||
|
def pack_annotations(annotations):
|
||||||
|
"""
|
||||||
|
Pack annotations to a special JSON string, reducing their volume a little.
|
||||||
|
"""
|
||||||
|
return json_dumps(
|
||||||
|
{
|
||||||
|
'tags': _pack_tags(annotations['tags']),
|
||||||
|
'semantic_breaks': ','.join(
|
||||||
|
[
|
||||||
|
f'{pos}:{level}'
|
||||||
|
for pos, level in annotations['semantic_breaks'].items()
|
||||||
|
]
|
||||||
|
),
|
||||||
|
'section_ids': annotations['section_ids'],
|
||||||
|
'links': annotations['links'],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _pack_tags(tags: dict) -> str:
|
||||||
|
"""
|
||||||
|
Utility function for packing tag information into a string.
|
||||||
|
"""
|
||||||
|
res = ''
|
||||||
|
for (i, f), anns in tags.items():
|
||||||
|
if anns:
|
||||||
|
anns_ = ','.join([f'{tag}={sem}' for tag, sem in anns.items()])
|
||||||
|
res += f'{i}-{f}:{anns_}\n'
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def unpack_annotations(json_text: str) -> dict:
|
||||||
|
"""
|
||||||
|
Unpack tag information from a string.
|
||||||
|
"""
|
||||||
|
annotations = json_loads(json_text)
|
||||||
|
tags = {}
|
||||||
|
for line in annotations['tags'].split('\n'):
|
||||||
|
if line:
|
||||||
|
range_, anns_ = line.split(':')
|
||||||
|
i, f = range_.split('-')
|
||||||
|
i = int(i)
|
||||||
|
f = int(f)
|
||||||
|
anns = {}
|
||||||
|
if anns_:
|
||||||
|
for ann_ in anns_.split(','):
|
||||||
|
tag_, sem_ = ann_.split('=')
|
||||||
|
anns[tag_] = sem_
|
||||||
|
tags[(i, f)] = anns
|
||||||
|
semantic_breaks = {}
|
||||||
|
for sb_ in annotations['semantic_breaks'].split(','):
|
||||||
|
pos_, lvl_ = sb_.split(':')
|
||||||
|
semantic_breaks[int(pos_)] = int(lvl_)
|
||||||
|
return {
|
||||||
|
'tags': tags,
|
||||||
|
'semantic_breaks': semantic_breaks,
|
||||||
|
'section_ids': annotations['section_ids'],
|
||||||
|
'links': annotations['links'],
|
||||||
|
}
|
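pack_annotations and unpack_annotations are intended as inverses, up to JSON's coercions (section_ids keys come back as strings, link ranges as lists); a sketch:

packed = pack_annotations(annotations)    # compact JSON string for storage
restored = unpack_annotations(packed)
assert restored['semantic_breaks'] == annotations['semantic_breaks']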
90
src/atextcrawler/utils/date_finder.py
Normal file
@ -0,0 +1,90 @@
|
||||||
|
"""
|
||||||
|
Find date expressions in a string.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
p_day = r'(0?[1-9]|[12][0-9]|3[01])'
|
||||||
|
p_month = r'(0?[1-9]|1[0-2])'
|
||||||
|
p_year = r'(20\d\d|19\d\d)'
|
||||||
|
sep = r'\D{1,2}'
|
||||||
|
p_t = r'(\D{0,4}([01][0-9]|2[0-3]):([0-5][0-9]))?'
|
||||||
|
|
||||||
|
|
||||||
|
format_re = {
|
||||||
|
'iso': (
|
||||||
|
re.compile(f'(^|\\D){p_year}{sep}{p_month}{sep}{p_day}(\\D{p_t}|$)'),
|
||||||
|
(1, 2, 3, 6, 7),
|
||||||
|
),
|
||||||
|
'dmy': (
|
||||||
|
re.compile(f'(^|\\D){p_day}{sep}{p_month}{sep}{p_year}(\\D{p_t}|$)'),
|
||||||
|
(3, 2, 1, 6, 7),
|
||||||
|
),
|
||||||
|
'mdy': (
|
||||||
|
re.compile(f'(^|\\D){p_month}{sep}{p_day}{sep}{p_year}(\\D{p_t}|$)'),
|
||||||
|
(3, 1, 2, 6, 7),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
lang_format = {
|
||||||
|
'de': ('iso', 'dmy'),
|
||||||
|
'en': ('iso', 'mdy'),
|
||||||
|
None: ('iso', 'dmy', 'mdy'),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_latest_date(text: str, lang: str = None) -> Optional[datetime]:
|
||||||
|
"""
|
||||||
|
Extract the latest date compatible with the *lang* from *text*.
|
||||||
|
|
||||||
|
Only consider dates in the past.
|
||||||
|
"""
|
||||||
|
dates = extract_dates(text, lang=lang)
|
||||||
|
return max(dates) if dates else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_dates(text: str, lang: str = None) -> list[datetime]:
|
||||||
|
"""
|
||||||
|
Extract dates from a string, optionally limiting formats to a language.
|
||||||
|
"""
|
||||||
|
dates = []
|
||||||
|
fmts = lang_format.get(lang, lang_format[None])
|
||||||
|
for fmt in fmts:
|
||||||
|
re_, slots = format_re[fmt]
|
||||||
|
matches = re_.findall(text)
|
||||||
|
if matches:
|
||||||
|
for match in matches:
|
||||||
|
try:
|
||||||
|
date = datetime(
|
||||||
|
int(match[slots[0]]),
|
||||||
|
int(match[slots[1]]),
|
||||||
|
int(match[slots[2]]),
|
||||||
|
int(match[slots[3]] or 0),
|
||||||
|
int(match[slots[4]] or 0),
|
||||||
|
)
|
||||||
|
if date <= datetime.utcnow():
|
||||||
|
dates.append(date)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return dates
|
||||||
|
|
||||||
|
|
||||||
|
## from htmldate import find_date
|
||||||
|
|
||||||
|
# def extract_last_pub(html):
|
||||||
|
# """
|
||||||
|
# Return an estimate for the time of last content publication from html.
|
||||||
|
# """
|
||||||
|
# # https://stackoverflow.com/questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported
|
||||||
|
# lxml_tree = lxml_html.fromstring(bytes(html, encoding='utf8'))
|
||||||
|
# # publication date (from startpage)
|
||||||
|
# try:
|
||||||
|
# date_string = find_date(lxml_tree)
|
||||||
|
# pd = date.fromisoformat(date_string)
|
||||||
|
# last_pub = datetime(pd.year, pd.month, pd.day, 12, 0, 0)
|
||||||
|
# except:
|
||||||
|
# last_pub = None
|
||||||
|
# return last_pub
|
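Usage sketch with illustrative strings:

extract_latest_date('Published 2021-11-29 18:15, updated 30.11.2021', lang='de')
# -> datetime(2021, 11, 30, 0, 0): the latest past date among the 'de' formats (iso, dmy)
extract_dates('Meeting on 03/04/2021', lang='en')
# -> [datetime(2021, 3, 4, 0, 0)]: with 'en' the mdy format applies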
278
src/atextcrawler/utils/durl.py
Normal file
@ -0,0 +1,278 @@
|
||||||
|
"""
|
||||||
|
Hyperlink parsing.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
from urllib.parse import urlsplit
|
||||||
|
|
||||||
|
import tldextract
|
||||||
|
from async_dns import types
|
||||||
|
from async_dns.resolver import ProxyResolver
|
||||||
|
from async_lru import alru_cache
|
||||||
|
|
||||||
|
from .link import in_blacklist
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
resolver = ProxyResolver(request_timeout=2)
|
||||||
|
|
||||||
|
|
||||||
|
async_dns_logger = logging.getLogger('async_dns')
|
||||||
|
async_dns_logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
|
extract = tldextract.TLDExtract(cache_dir=False)
|
||||||
|
|
||||||
|
|
||||||
|
# tldextract uses filelock; set its loglevel to warning
|
||||||
|
filelock_logger = logging.getLogger('filelock')
|
||||||
|
filelock_logger.setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
|
||||||
|
class Durl:
|
||||||
|
"""
|
||||||
|
Decomposed URL, contains :class:`urllib.parse.SplitResult`.
|
||||||
|
|
||||||
|
When constructing this class, it has to be awaited, e.g.:
|
||||||
|
|
||||||
|
my_durl = await Durl('http://www.example.com/whatever')
|
||||||
|
|
||||||
|
The given URL will be decomposed, validated and normalized.
|
||||||
|
If the URL is invalid, we return None instead of an instance.
|
||||||
|
|
||||||
|
If the given *base* is None, the URL must be absolute and
|
||||||
|
the hostname must be valid (DNS lookup).
|
||||||
|
|
||||||
|
If the given URL is not absolute, an already decomposed (and thus
|
||||||
|
valid) *base* Durl must be given; otherwise the URL is invalid.
|
||||||
|
|
||||||
|
The *base* Durl can contain a path (but no arguments or fragments),
|
||||||
|
in which case the URL - if not absolute - must begin with this path.
|
||||||
|
|
||||||
|
The scheme must be http or https. If the URL begins with '//',
|
||||||
|
'http:' is prepended.
|
||||||
|
|
||||||
|
If the hostname is longer than 90 characters, the URL is invalid.
|
||||||
|
|
||||||
|
Default port numbers (80 for http, 443 for https) are removed.
|
||||||
|
|
||||||
|
The hostname is changed to lower case. Spaces in the hostname
|
||||||
|
make the URL invalid.
|
||||||
|
|
||||||
|
URL fragments are removed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_url = None
|
||||||
|
_base = None
|
||||||
|
_match_base = False
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
base: Optional['Durl'] = None,
|
||||||
|
match_base: bool = False,
|
||||||
|
):
|
||||||
|
self._url = url
|
||||||
|
self._base = base
|
||||||
|
self._match_base = match_base
|
||||||
|
|
||||||
|
def __await__(self):
|
||||||
|
return self.__ainit__().__await__()
|
||||||
|
|
||||||
|
async def __ainit__(self):
|
||||||
|
res = None
|
||||||
|
try:
|
||||||
|
# add missing scheme for urls beginning with '//'
|
||||||
|
if self._url.startswith('//'):
|
||||||
|
self._url = 'http:' + self._url
|
||||||
|
# split the url
|
||||||
|
durl = urlsplit(self._url)
|
||||||
|
# remove default port numbers 80, 443
|
||||||
|
netloc = durl.netloc
|
||||||
|
if durl.port == 80 and durl.scheme == 'http':
|
||||||
|
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
|
||||||
|
if durl.port == 443 and durl.scheme == 'https':
|
||||||
|
netloc = netloc.removesuffix(str(durl.port)).rstrip(':')
|
||||||
|
if durl.hostname and durl.hostname != durl.netloc.lower():
|
||||||
|
user_pass = ''
|
||||||
|
if durl.username and durl.password:
|
||||||
|
user_pass = f'{durl.username}:{durl.password}@'
|
||||||
|
port = ''
|
||||||
|
if durl.port:
|
||||||
|
port = f':{durl.port}'
|
||||||
|
netloc = f'{user_pass}{durl.hostname.lower()}{port}'
|
||||||
|
durl = durl._replace(netloc=netloc)
|
||||||
|
|
||||||
|
if self._base:
|
||||||
|
# if missing fill in scheme and netloc from base
|
||||||
|
if not durl.scheme:
|
||||||
|
durl = durl._replace(scheme=self._base.scheme)
|
||||||
|
if not durl.netloc:
|
||||||
|
durl = durl._replace(netloc=self._base.netloc)
|
||||||
|
# if match_base, then set res only if the
|
||||||
|
# url is compatible with base url
|
||||||
|
if not self._match_base:
|
||||||
|
res = durl
|
||||||
|
else:
|
||||||
|
if durl.netloc == self._base.netloc:
|
||||||
|
if durl.scheme == self._base.scheme:
|
||||||
|
if self._base.path not in ('/', ''):
|
||||||
|
if durl.path.startswith(self._base.path):
|
||||||
|
res = durl
|
||||||
|
else:
|
||||||
|
res = durl
|
||||||
|
else:
|
||||||
|
res = durl
|
||||||
|
except:
|
||||||
|
logger.exception(
|
||||||
|
f'Durl init failed url={self._url}'
|
||||||
|
f' base={self._base} match_base={self._match_base}'
|
||||||
|
)
|
||||||
|
res = None
|
||||||
|
if res:
|
||||||
|
res = res._replace(fragment='')
|
||||||
|
if not res.hostname or len(res.hostname) > 90:
|
||||||
|
res = None
|
||||||
|
elif res.scheme not in ('https', 'http'):
|
||||||
|
res = None
|
||||||
|
elif ' ' in res.hostname or '.' not in res.hostname:
|
||||||
|
res = None
|
||||||
|
elif not (await get_ips(res.hostname)):
|
||||||
|
res = None
|
||||||
|
elif not res.path.startswith('/'):
|
||||||
|
res = res._replace(path='/')
|
||||||
|
if res:
|
||||||
|
if res.fragment is None:
|
||||||
|
res.fragment = ''
|
||||||
|
self._durl = res
|
||||||
|
return self
|
||||||
|
self._durl = None
|
||||||
|
|
||||||
|
def __getattr__(self, attr):
|
||||||
|
return getattr(self._durl, attr)
|
||||||
|
|
||||||
|
def url(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the URL as string.
|
||||||
|
"""
|
||||||
|
return self._durl.geturl()
|
||||||
|
|
||||||
|
def pwa(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the (base-relative) path with args of the Durl.
|
||||||
|
"""
|
||||||
|
if self._base and self._match_base:
|
||||||
|
path = self._durl.path.removeprefix(self._base.path)
|
||||||
|
else:
|
||||||
|
path = self._durl.path
|
||||||
|
qs = f'?{self._durl.query}' if self._durl.query else ''
|
||||||
|
return f'{path}{qs}'.lstrip('/')
|
||||||
|
|
||||||
|
def has_path(self) -> bool:
|
||||||
|
"""
|
||||||
|
Return whether the Durl has a non-trivial path.
|
||||||
|
"""
|
||||||
|
return self._durl.path not in ('/', '')
|
||||||
|
|
||||||
|
def site(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the site (base_url).
|
||||||
|
"""
|
||||||
|
return f'{self._durl.scheme}://{self._durl.netloc}/'
|
||||||
|
|
||||||
|
def domain(self) -> str:
|
||||||
|
"""
|
||||||
|
Return the domain of the Durl (wrong in case of second-level domains).
|
||||||
|
"""
|
||||||
|
levels = extract(self._durl.hostname)
|
||||||
|
return '.'.join(levels[-2:]).lower()
|
||||||
|
|
||||||
|
def replace_scheme(self, scheme: str) -> None:
|
||||||
|
"""
|
||||||
|
Replace the scheme (must be 'http' or 'https').
|
||||||
|
"""
|
||||||
|
self._durl = self._durl._replace(scheme=scheme)
|
||||||
|
|
||||||
|
|
||||||
|
@alru_cache(maxsize=1000)
|
||||||
|
async def get_ips(hostname: str) -> set[str]:
|
||||||
|
"""
|
||||||
|
Return IPv4 and IPv6 addresses of the given hostname.
|
||||||
|
"""
|
||||||
|
ips = set()
|
||||||
|
for type_ in (types.A, types.AAAA):
|
||||||
|
try:
|
||||||
|
res, cached = await resolver.query(hostname, type_)
|
||||||
|
if res:
|
||||||
|
if addr := res.get_record([type_]):
|
||||||
|
ips.add(addr.data)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return ips
|
||||||
|
|
||||||
|
|
||||||
|
def get_url_variants(url: str) -> list[str]:
|
||||||
|
"""
|
||||||
|
Return variants of the URL.
|
||||||
|
|
||||||
|
Replace http with https and vice versa;
|
||||||
|
prepend or remove 'www.' to or from the beginning of the hostname.
|
||||||
|
"""
|
||||||
|
if url.startswith('http://www.'):
|
||||||
|
s = url.removeprefix('http://www.')
|
||||||
|
return [url, f'http://{s}', f'https://www.{s}', f'https://{s}']
|
||||||
|
elif url.startswith('http://'):
|
||||||
|
s = url.removeprefix('http://')
|
||||||
|
return [url, f'http://www.{s}', f'https://www.{s}', f'https://{s}']
|
||||||
|
elif url.startswith('https://www.'):
|
||||||
|
s = url.removeprefix('https://www.')
|
||||||
|
return [url, f'https://{s}', f'http://www.{s}', f'http://{s}']
|
||||||
|
elif url.startswith('https://'):
|
||||||
|
s = url.removeprefix('https://')
|
||||||
|
return [url, f'https://www.{s}', f'http://www.{s}', f'http://{s}']
|
||||||
|
else:
|
||||||
|
return [url]
|
||||||
|
|
||||||
|
|
||||||
|
async def assort_links(
|
||||||
|
links: dict[str, tuple[int, int, list[str]]],
|
||||||
|
durl: Durl,
|
||||||
|
text: str,
|
||||||
|
base_url: str = None,
|
||||||
|
) -> tuple[
|
||||||
|
dict[str, tuple[int, int, list[str]]],
|
||||||
|
dict[Durl, tuple[list[str], str]],
|
||||||
|
dict[Durl, tuple[list[str], str]],
|
||||||
|
]:
|
||||||
|
"""
|
||||||
|
Sort links into a cleaned, an internal and an external dict.
|
||||||
|
|
||||||
|
The cleaned dict maps absolute URLs to char ranges and relations.
|
||||||
|
The internal dict maps absolute URLs to relations and the linked text.
|
||||||
|
The external dict maps absolute URLs to relations and the linked text.
|
||||||
|
The relations are link relations, e.g. rel="canonical".
|
||||||
|
|
||||||
|
If base_url is set, it is used to distinguish internal and external
|
||||||
|
links. If it is not set, the base_url is obtained from *durl*.
|
||||||
|
"""
|
||||||
|
res_int = {}
|
||||||
|
res_ext = {}
|
||||||
|
if not base_url:
|
||||||
|
base_url = durl.site().lower()
|
||||||
|
base_durl = await Durl(base_url)
|
||||||
|
cleaned_links = {}
|
||||||
|
for href, (i, f, rel) in links.items():
|
||||||
|
durl = await Durl(href, base=base_durl)
|
||||||
|
if not durl:
|
||||||
|
continue
|
||||||
|
if durl.hostname and in_blacklist(durl.hostname):
|
||||||
|
continue
|
||||||
|
cleaned_links[durl.url()] = i, f, rel
|
||||||
|
txt = text[i:f]
|
||||||
|
if durl.site().lower() == base_url:
|
||||||
|
res_int[durl] = rel, txt
|
||||||
|
else:
|
||||||
|
res_ext[durl] = rel, txt
|
||||||
|
return cleaned_links, res_int, res_ext
|
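Durl must be awaited inside a coroutine and performs a DNS lookup, so the sketch below assumes a resolvable example hostname:

async def durl_demo():
    durl = await Durl('https://www.example.com/blog/post?x=1#top')
    if durl:
        durl.url()      # 'https://www.example.com/blog/post?x=1' (fragment dropped)
        durl.site()     # 'https://www.example.com/'
        durl.pwa()      # 'blog/post?x=1'
        durl.domain()   # 'example.com'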
136
src/atextcrawler/utils/html.py
Normal file
@ -0,0 +1,136 @@
|
||||||
|
"""
|
||||||
|
Utilities for extracting information from html.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from html import unescape
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from .lang import clean_lang
|
||||||
|
from .tag import drop_roles, drop_tags, keep_tags
|
||||||
|
|
||||||
|
re_ = {
|
||||||
|
'html_lang': re.compile(
|
||||||
|
'<html[^>]*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S
|
||||||
|
),
|
||||||
|
'title': re.compile('<title[^>]*>([^<]*)</title>', re.I | re.S),
|
||||||
|
'strip': re.compile(
|
||||||
|
'<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S
|
||||||
|
),
|
||||||
|
'empty_tag': re.compile(r'<(?P<tag>\w+)( [^>]*)?>(\s*)</(?P=tag)>', re.S),
|
||||||
|
'whitespace': re.compile('(\s| )+', re.S),
|
||||||
|
'whitespace_': re.compile('\s| ?'), # allow broken  
|
||||||
|
'whitespace_near_tag': re.compile(
|
||||||
|
'\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1'
|
||||||
|
'|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*',
|
||||||
|
re.S,
|
||||||
|
),
|
||||||
|
'whitespace_tag_tag': re.compile('(\s+)((<[^>]+>\s+)+)', re.S),
|
||||||
|
'whitespace_tag_tag_func': re.compile('(<[^>]+>)\s+', re.S),
|
||||||
|
'http_equiv': re.compile('(<meta [^>]*http-equiv[^>]*>)', re.I | re.S),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def whitespace_tag_tag(match_obj):
|
||||||
|
"""
|
||||||
|
Helper function for removing whitespace between tags.
|
||||||
|
"""
|
||||||
|
return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', match_obj.group(2))
|
||||||
|
|
||||||
|
|
||||||
|
def clean_html(s: Optional[str]) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Clean an html string.
|
||||||
|
|
||||||
|
Unescape htmlentities and replace whitespaces with ' ' (ASCII char 0x20).
|
||||||
|
|
||||||
|
See also: https://www.lesinskis.com/python-unicode-whitespace.html
|
||||||
|
"""
|
||||||
|
return re_['whitespace_'].sub(' ', unescape(s)).strip() if s else None
|
||||||
|
|
||||||
|
|
||||||
|
def get_html_lang(html: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Return the language, if any, found in the lang attribute of the html tag.
|
||||||
|
"""
|
||||||
|
m = re_['html_lang'].search(html)
|
||||||
|
return clean_lang(m.group(1)) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_title(html: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Extract title tags from html returning their content as a string.
|
||||||
|
"""
|
||||||
|
if not (titles := re_['title'].findall(html)):
|
||||||
|
return None
|
||||||
|
titles = [clean_html(title) for title in reversed(titles) if title]
|
||||||
|
return ' - '.join(titles).strip(' |')
|
||||||
|
|
||||||
|
|
||||||
|
def clean_page(html):
|
||||||
|
"""
|
||||||
|
Remove unwanted tags including their content from html.
|
||||||
|
|
||||||
|
Drop tags in *drop_tags* as well as tags with a role in *drop_roles*.
|
||||||
|
Also drop tags with attribute aria-hidden=true.
|
||||||
|
|
||||||
|
Return a beautiful soup.
|
||||||
|
"""
|
||||||
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
for tag in drop_tags:
|
||||||
|
for n in soup.find_all(tag):
|
||||||
|
n.decompose()
|
||||||
|
for n in soup.find_all(attrs={'aria-hidden': 'true'}):
|
||||||
|
n.decompose()
|
||||||
|
for role in drop_roles:
|
||||||
|
for n in soup.find_all(attrs={'rel': role}):
|
||||||
|
n.decompose()
|
||||||
|
return soup
|
||||||
|
|
||||||
|
|
||||||
|
def clean_body(body):
|
||||||
|
"""
|
||||||
|
Clean an html body.
|
||||||
|
|
||||||
|
Remove unwanted tags (keeping their content); remove empty tags;
|
||||||
|
remove and replace whitespaces in several ways.
|
||||||
|
|
||||||
|
In the end the only whitespace is a space and there are no
|
||||||
|
consecutive spaces.
|
||||||
|
"""
|
||||||
|
body = re_['strip'].sub(' ', body)
|
||||||
|
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
|
||||||
|
body = re_['whitespace'].sub(' ', body)
|
||||||
|
while re_['empty_tag'].search(body):
|
||||||
|
body = re_['empty_tag'].sub(r'\3', body)
|
||||||
|
body = re_['whitespace_near_tag'].sub(r'<\1>', body)
|
||||||
|
body = re_['whitespace'].sub(' ', body)
|
||||||
|
body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body)
|
||||||
|
return body.strip().replace('\u00ad', '') # soft hyphen
|
||||||
|
|
||||||
|
|
||||||
|
def get_html_redirect(html: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Return an html redirect in an http-equiv meta tag.
|
||||||
|
|
||||||
|
If none is found, return None.
|
||||||
|
"""
|
||||||
|
redir_url = None
|
||||||
|
http_equivs = re_['http_equiv'].findall(html)
|
||||||
|
for raw in http_equivs:
|
||||||
|
tag = BeautifulSoup(raw, 'html.parser').meta
|
||||||
|
if tag and tag.get('http-equiv', '').lower() == 'refresh':
|
||||||
|
if content := tag.get('content'):
|
||||||
|
try:
|
||||||
|
_, redir_url = content.split(';')
|
||||||
|
redir_url = (
|
||||||
|
redir_url.strip()
|
||||||
|
.removeprefix('url=')
|
||||||
|
.removeprefix('URL=')
|
||||||
|
.strip("'")
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return redir_url
|
58
src/atextcrawler/utils/http.py
Normal file
@ -0,0 +1,58 @@
|
||||||
|
"""
|
||||||
|
Utility functions related to http.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from multidict import CIMultiDictProxy
|
||||||
|
|
||||||
|
from ..models import Site
|
||||||
|
from .durl import Durl
|
||||||
|
|
||||||
|
re_ = {
|
||||||
|
'link_header': re.compile(',\s*(?=<)'),
|
||||||
|
'rel_canonical': re.compile(';\s*rel\s*=\s*["\']?canonical', re.I),
|
||||||
|
'rel_shortlink': re.compile(';\s*rel\s*=\s*["\']?shortlink', re.I),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def get_header_links(
|
||||||
|
headers: CIMultiDictProxy,
|
||||||
|
durl: Durl,
|
||||||
|
site: Optional[Site],
|
||||||
|
) -> dict[str, Optional[str]]:
|
||||||
|
"""
|
||||||
|
Extract canonical and shortlink links from http headers.
|
||||||
|
|
||||||
|
*durl* must be the Durl of the fetched page and *site* - if not None -
|
||||||
|
must be the Site to which the page belongs.
|
||||||
|
|
||||||
|
Return a (default)dict with 'canonical' and 'shortlink' as keys.
|
||||||
|
The values default to None.
|
||||||
|
"""
|
||||||
|
res = {}
|
||||||
|
canonical = shortlink = None
|
||||||
|
if 'link' in headers and (link_headers := headers.getall('link')):
|
||||||
|
links = []
|
||||||
|
for link_header in link_headers:
|
||||||
|
links += re_['link_header'].split(link_header)
|
||||||
|
url = durl.url()
|
||||||
|
base_url = site.base_url if site else url
|
||||||
|
base_durl = await Durl(base_url) if base_url else None
|
||||||
|
for link in links:
|
||||||
|
if not canonical and 'canonical' in link.lower():
|
||||||
|
if re_['rel_canonical'].search(link):
|
||||||
|
canon_url = link.strip().lstrip('<').split('>')[0]
|
||||||
|
if canon_durl := await Durl(canon_url, base=base_durl):
|
||||||
|
canonical = canon_durl.url()
|
||||||
|
if not shortlink and 'shortlink' in link.lower():
|
||||||
|
if re_['rel_shortlink'].search(link):
|
||||||
|
short_url = link.strip().lstrip('<').split('>')[0]
|
||||||
|
if short_durl := await Durl(short_url, base=base_durl):
|
||||||
|
shortlink = short_durl.url()
|
||||||
|
if canonical and shortlink:
|
||||||
|
break
|
||||||
|
res['canonical'] = canonical
|
||||||
|
res['shortlink'] = shortlink
|
||||||
|
return res
|
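The header format handled here, with illustrative values:

# Link: <https://example.com/?p=123>; rel=shortlink,
#       <https://example.com/2021/11/some-post/>; rel="canonical"
#
# For such a header, get_header_links() returns
# {'canonical': 'https://example.com/2021/11/some-post/',
#  'shortlink': 'https://example.com/?p=123'}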
32
src/atextcrawler/utils/json.py
Normal file
@ -0,0 +1,32 @@
|
||||||
|
"""
|
||||||
|
Custom JSON encoder.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class JSONEncoderExt(json.JSONEncoder):
|
||||||
|
"""
|
||||||
|
Extended JSON encoder with encoding of sets as lists.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def default(self, obj):
|
||||||
|
"""
|
||||||
|
Encode sets as lists and everything else as by default.
|
||||||
|
"""
|
||||||
|
if isinstance(obj, set):
|
||||||
|
return list(obj)
|
||||||
|
return json.JSONEncoder.default(self, obj)
|
||||||
|
|
||||||
|
|
||||||
|
def json_dumps(obj):
|
||||||
|
"""
|
||||||
|
Encode an object to a JSON string using JSONEncoderExt.
|
||||||
|
"""
|
||||||
|
return json.dumps(obj, cls=JSONEncoderExt)
|
||||||
|
|
||||||
|
|
||||||
|
json_loads = json.loads
|
||||||
|
"""
|
||||||
|
Decoding of JSON strings as by default.
|
||||||
|
"""
44
src/atextcrawler/utils/lang.py
Normal file
@@ -0,0 +1,44 @@
"""
Utility functions related to languages.
"""

from pathlib import Path
from typing import Optional

import gcld3

asset_path = Path(__file__).parent.parent / 'assets'


with open(asset_path / 'iso_639-1', 'r') as f:
    iso_639_1_codes = f.read().strip().split('\n')


lang_detector = gcld3.NNetLanguageIdentifier(
    min_num_bytes=0, max_num_bytes=1000
)


def clean_lang(lang: Optional[str]) -> Optional[str]:
    """
    Clean a language code string: it must be an ISO 639-1 code or None.
    """
    if lang is None:
        return None
    lang = lang[:2].lower()
    if lang in iso_639_1_codes:
        return lang
    return None


def extract_content_language(text: str) -> Optional[str]:
    """
    Extract the language from a text.
    """
    if len(text) < 10:
        return None
    lang = None
    lang_det = lang_detector.FindLanguage(text=text)
    if lang_det.is_reliable:
        lang = lang_det.language[:2]
    return lang
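
Hypothetical usage, assuming gcld3 and the bundled iso_639-1 asset are available (the detection result depends on gcld3's model):

from atextcrawler.utils.lang import clean_lang, extract_content_language

print(clean_lang('EN-us'))   # 'en'
print(clean_lang('zz'))      # None (not an ISO 639-1 code)
print(extract_content_language('Guten Tag, wie geht es Ihnen heute?'))  # probably 'de'
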
116
src/atextcrawler/utils/link.py
Normal file
@@ -0,0 +1,116 @@
"""
Hyperlinks (a href, link).
"""

from pathlib import Path
from typing import Optional

import tldextract

nofollow_link_rels = set(
    [
        'nofollow',
        'search',
        'noreferrer',
        'noopener',
        'help',
        'license',
    ]
)
"""
Do not follow the hrefs in anchor tags with these values of the rel attribute.
"""


meta_names = (
    'generator',
    'lang',
    'language',
    'description',
    'keywords',
    'author',
    'title',
    'subject',
    'revised',
    'abstract',
    'topic',
    'summary',
    'classification',
    'category',
    'reply-to',
    'owner',
    'url',
    'identifier-URL',
    'geo.position',
    'geo.region',
    'geo.placename',
    'dc.language',
)
"""
Values of the name attribute of meta tags to keep.

See also: https://gist.github.com/lancejpollard/1978404
See also: https://github.com/joshbuchea/HEAD
"""


meta_props = (
    'og:site_name',
    'og:locale',
    'og:type',
    'og:latitude',
    'og:longitude',
    'og:street',
    'og:locality',
    'og:region',
    'og:postal',
    'og:country',
)
"""
Values of the property attribute of meta tags to keep.
"""


link_rels = set(
    [
        'webmention',
        'pingback',
        'alternate',
        'canonical',
        'author',
    ]
)
"""
Values of the rel attribute of link tags to keep.
"""


def load_blacklist():
    """
    Return the 10000 most popular internet domains.
    """
    path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
    with open(path, 'r') as file:
        domains = file.read().strip().splitlines()
    return domains


domain_blacklist = load_blacklist()


def in_blacklist(hostname: str) -> Optional[str]:
    """
    Return a match of the hostname in the blacklist, or None.
    """
    domain = extract_domain(hostname)
    if domain in domain_blacklist:
        return hostname
    return None


def extract_domain(hostname: str) -> str:
    """
    Extract the lower-case domain from a hostname.
    """
    levels = tldextract.extract(hostname)
    return '.'.join(levels[-2:]).lower()
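
Hypothetical usage, assuming tldextract and the top_1e4 asset are available; the blacklist result depends on that asset's contents:

from atextcrawler.utils.link import extract_domain, in_blacklist

print(extract_domain('Blog.Example.CO.UK'))    # 'example.co.uk'
print(in_blacklist('www.google.com'))          # probably 'www.google.com' (popular domain)
print(in_blacklist('tiny-obscure-site.org'))   # None if its domain is not in the top 10000
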
120
src/atextcrawler/utils/muse.py
Normal file
@@ -0,0 +1,120 @@
"""
Parse muse-formatted plaintext (delivered by amusewiki).
"""

import re
from datetime import datetime
from typing import Optional

from .date_finder import extract_latest_date
from .lang import clean_lang

re_tag = re.compile(r'<[^<]+?>')


def parse_muse(text: str) -> Optional[tuple[dict, str]]:
    """
    Parse a MUSE string returning meta information and the text body.
    """
    head, body = split_head_body(text)
    if not head:
        return None
    meta = parse_head(head)
    if not meta:
        return None
    return extract_muse_meta(meta, body), body


def split_head_body(text: str) -> tuple[str, str]:
    """
    Split a MUSE string into head and body and return both.
    """
    head = ''
    while text.startswith('#'):
        line_end = text.find('\n') + 1
        head += text[:line_end]
        text = text[line_end:]
    return head.strip(), text.strip()


def parse_head(text: str) -> dict:
    """
    Parse a MUSE head and return a dict mapping field names to values.
    """
    fields = {}
    for line in text.split('\n'):
        name, value = line.strip().split(' ', 1)
        fields[name[1:]] = value
    return fields


amusewiki_fields = [
    'author',
    'title',
    'lang',
    'LISTtitle',  # reduced title for alphabetical sorting
    'subtitle',
    'SORTauthors',  # authors separated by ';' or ',' (only for indexing)
    'SORTtopics',  # topics separated by ';' or ',' (only for indexing)
    'date',  # publication year
    'pubdate',  # publication datetime
    'notes',  # additional info (orig title, translators, credits, ...)
    'source',  # preferred format: "Retrieved on March 8, 2012 from {URL}"
    'publisher',
    'isbn',
    #'rights',
    'seriesname',
    'seriesnumber',
    #'hyphenation',  # irrelevant
    #'slides',  # irrelevant
    #'DELETED',  # irrelevant
    #'cover',  # irrelevant
    #'coverwidth',  # irrelevant
    #'nocoverpage',  # irrelevant
    #'notoc',  # irrelevant
    #'nofinalpage',  # irrelevant
    #'impressum',  # irrelevant
    #'continuefootnotes',  # irrelevant
    #'centerchapter',  # irrelevant
    #'centersection',  # irrelevant
]
"""
Amusewiki fields (cf. https://amusewiki.org/library/manual).
"""


re_list = re.compile('[;,]')


def extract_muse_meta(meta, body) -> dict:
    """
    Extract meta information from muse header and muse body.
    """
    authors = set()
    if author := meta.get('author', '').strip():
        authors.add(author)
    if sortauthors := meta.get('SORTauthors', '').strip():
        for author in re_list.split(sortauthors):
            if author_ := author.strip():
                authors.add(author_)
    pubdate = meta.get('pubdate', '').strip()
    pub_date: Optional[datetime] = None
    if pubdate:
        try:
            pub_date = datetime.fromisoformat(pubdate)
        except ValueError:
            pub_date = extract_latest_date(pubdate)
    summary = re_tag.sub('', body[:1000].split('\n\n')[0])
    return {
        'title': re_tag.sub('', meta.get('title', '')) or None,
        'authors': authors,
        'lang': clean_lang(meta.get('lang')),
        'keywords': [
            s.strip()
            for s in re_list.split(meta.get('SORTtopics', '').strip())
            if s.strip()
        ],
        'pub_date': pub_date,
        'summary': summary,
        'orig_source': meta.get('source', '').strip() or None,
    }
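
A hypothetical round trip through parse_muse with a tiny made-up MUSE document (assuming the package and its assets are importable):

from atextcrawler.utils.muse import parse_muse

muse_text = (
    '#title An Example Text\n'
    '#author Jane Doe\n'
    '#lang en\n'
    '#pubdate 2021-11-01T00:00:00\n'
    '\n'
    'First paragraph of the body.\n'
    '\n'
    'Second paragraph.'
)
meta, body = parse_muse(muse_text)
print(meta['title'], meta['authors'], meta['lang'], meta['pub_date'])
print(body.startswith('First paragraph'))  # True
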
22
src/atextcrawler/utils/probe.py
Normal file
@@ -0,0 +1,22 @@
"""
Utility functions for probing / sampling.
"""


def extract_samples(items, n=5):
    """
    Extract up to n sample elements from the given dict or list.

    If *items* is a dict return the elements from the list of keys.
    """
    l = len(items)
    if l <= n:
        return items
    poss = []
    step = (l + 1) / n
    for i in range(n):
        pos = int(step * i)
        if pos < l and (not poss or pos > poss[-1]):
            poss.append(pos)
    items_list = list(items)
    return [items_list[pos] for pos in poss]
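
A quick sketch of the sampling behaviour (hypothetical, assuming the package is importable):

from atextcrawler.utils.probe import extract_samples

print(extract_samples(list(range(20)), n=5))  # [0, 4, 8, 12, 16]
print(extract_samples(['a', 'b', 'c'], n=5))  # ['a', 'b', 'c'] (short input returned as is)
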
74
src/atextcrawler/utils/section.py
Normal file
@@ -0,0 +1,74 @@
"""
Operations on text sections.

Semantic breaks are character positions within a text (0-offset)
where a new section begins. More precisely, the character position
contains a space, and only at the next position does a semantically
breaking tag (e.g., an h1 or a br) begin.

Each semantic break has a level, which indicates its breaking strength.
The lower the level (e.g., h1 has a lower level than h2), the
stronger the break.

Implicitly, if position 0 has no semantic break, a semantic break
at position 0 with level 80 is added.

Semantic breaks can be used to split a text into sections.
The lower the maximum level of the semantic breaks taken into account,
the coarser the segmentation and the fewer the sections.
Each section is given the level of the semantic break at its beginning.

From another point of view, sections have levels indicating
the segmentation depth.

The levels for html tags are defined in tag.py.

The *semantic_breaks* argument in the functions below
is a dict mapping the character position of the semantic break
to the level of a section beginning at this position
(if segmentation is done at this or a higher level).
"""


def iter_sections(text, semantic_breaks, max_level=59):
    """
    Iterate over sections, limiting to those with a maximum level.

    Yield (start_pos, end_pos, level, text).
    *text* is assumed to have the first semantic break at position 0.
    """
    n = len(text)
    last_pos = 0
    last_level = semantic_breaks.get(0, 80)
    for pos, level in sorted(semantic_breaks.items()):
        if level <= max_level and last_pos != pos:
            yield last_pos, pos, last_level, text[last_pos + 1 : pos]
            last_pos = pos
            last_level = level
    if last_pos < n:
        yield last_pos, n, last_level, text[last_pos:]


def concat_section_texts(text, semantic_breaks, min_len=2000):
    """
    Try to concat consecutive sections into chunks with a minimum length.

    Yield (section_ids, combined_text).
    """
    n = len(text)
    last_pos = 0
    section_ids = []
    for section_id, pos in enumerate(semantic_breaks.keys()):
        if pos >= last_pos + min_len:
            if n - pos < min_len:
                for id_ in [
                    i for i, k in enumerate(semantic_breaks.keys()) if k >= pos
                ]:
                    section_ids.append(id_)
                pos = n
            yield section_ids, text[last_pos:pos]
            last_pos = pos
            section_ids = []
        section_ids.append(section_id)
    if last_pos < n:
        yield section_ids, text[last_pos:]
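
A usage sketch for iter_sections, mirroring the first test case in tests/section.py further below (assuming the package is importable):

from atextcrawler.utils.section import iter_sections

text = 'abcdefghijklmnopqrstuvwxyz'
semantic_breaks = {0: 80, 5: 2, 15: 1, 20: 3}
for start, end, level, chunk in iter_sections(text, semantic_breaks, max_level=100):
    print(start, end, level, repr(chunk))
# (0, 5, 80, 'bcde'), (5, 15, 2, 'ghijklmno'), (15, 20, 1, 'qrst'), (20, 26, 3, 'uvwxyz')
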
92
src/atextcrawler/utils/similarity.py
Normal file
@@ -0,0 +1,92 @@
"""
Text similarity with simhash.
"""

import logging

from asyncpg import Connection
from simhash import Simhash, SimhashIndex

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


postgresql_bigint_offset = 9223372036854775808
"""
Subtract this number to get a PostgreSQL bigint from a 64bit int.
"""


def get_features(txt: str) -> list[str]:
    """
    Extract features from string for use with Simhash.
    """
    width = 3
    txt = txt.replace(' ', '').lower()
    return [txt[i : i + width] for i in range(max(len(txt) - width + 1, 1))]


def simhash_to_bigint(simhash: Simhash) -> int:
    """
    Convert a simhash to PostgreSQL's bigint value range.
    """
    return simhash.value - postgresql_bigint_offset


def simhash_from_bigint(bigint: int) -> Simhash:
    """
    Convert a simhash from PostgreSQL's bigint to a Simhash instance.
    """
    return Simhash(bigint + postgresql_bigint_offset, log=logger)


def get_simhash(text: str) -> Simhash:
    """
    Return the Simhash of the given text.
    """
    return Simhash(get_features(text), log=logger)


async def get_simhash_index(conn: Connection, site_id: int) -> SimhashIndex:
    """
    Return a simhash index with hashes of all stored resources of the site.
    """
    sql = (
        "SELECT r.id, r.simhash FROM site_path sp, resource r"
        " WHERE sp.site_id=$1 AND sp.resource_id=r.id"
    )
    rows = await conn.fetch(sql, site_id)
    objs = [
        (
            str(row['id']),
            Simhash(row['simhash'] + postgresql_bigint_offset, log=logger),
        )
        for row in rows
    ]
    return SimhashIndex(objs, k=3, log=logger)


def create_simhash(
    index: SimhashIndex,
    resource_id: int,
    simhash_instance: Simhash,
) -> int:
    """
    Add a resource with given id and simhash to a simhash index.

    Return the simhash value shifted into PostgreSQL's bigint range.

    (The simhash field of the resource's database entry is not updated.)
    """
    index.add(str(resource_id), simhash_instance)
    return simhash_to_bigint(simhash_instance)


def search_simhash(index: SimhashIndex, simhash_inst: Simhash) -> list[int]:
    """
    Return the ids of similar resources from the index.
    """
    found = index.get_near_dups(simhash_inst)
    if found:
        return sorted([int(elem) for elem in found])
    return []
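
A usage sketch for the in-memory index functions, mirroring tests/simhash.py further below (assuming the simhash package is installed):

from simhash import SimhashIndex

from atextcrawler.utils.similarity import create_simhash, get_simhash, search_simhash

index = SimhashIndex([], k=3)
create_simhash(index, 101, get_simhash('hello ' * 20))
create_simhash(index, 102, get_simhash('another one'))

print(search_simhash(index, get_simhash('hello ' * 20 + 'X')))  # [101]
print(search_simhash(index, get_simhash('another one')))        # [102]
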
189
src/atextcrawler/utils/tag.py
Normal file
@@ -0,0 +1,189 @@
"""
Information collections related to html tags.
"""


drop_tags = [
    'applet',
    'area',
    'audio',
    'base',
    'basefont',
    'bdi',
    'bdo',
    'button',
    'canvas',
    'code',
    'command',
    'data',
    'datalist',
    'dir',
    'embed',
    'fieldset',
    'figure',
    'form',
    'frame',
    'frameset',
    'iframe',
    'img',
    'input',
    'label',
    'legend',
    'map',
    'menuitem',
    'meter',
    'noframes',
    'noscript',
    'object',
    'optgroup',
    'option',
    'param',
    'picture',
    'progress',
    'rp',
    'rt',
    'ruby',
    'samp',
    'script',
    'select',
    'source',
    'style',
    'svg',
    'template',
    'textarea',
    'track',
    'var',
    'video',
]
"""
Tags to drop, including their content.
"""


keep_tags = {
    'a': (0, 0, ''),
    'abbr': (0, 0, 'st'),
    'acronym': (0, 0, 'st'),
    'address': (1, 0, 'm'),
    'article': (1, 15, ''),
    'aside': (1, 0, 'd'),
    'b': (0, 0, 'st'),
    'blockquote': (1, 65, 'q'),
    'br': (1, 80, ''),
    'caption': (1, 68, ''),
    'center': (1, 50, ''),
    'cite': (1, 0, 'd'),
    'col': (1, 75, ''),
    'colgroup': (1, 73, ''),
    'dd': (1, 70, 'li'),
    'del': (0, 0, 'se'),
    'details': (1, 0, 'd'),
    'dfn': (0, 0, 'st'),
    'div': (1, 60, ''),  # lvl often revised to min of contained tags
    'dl': (1, 70, 'l'),
    'dt': (1, 70, 'li'),
    'em': (0, 0, 'st'),
    'figcaption': (1, 0, ''),
    'font': (0, 0, 's'),
    'footer': (1, 15, ''),
    'h1': (1, 30, ''),
    'h2': (1, 32, ''),
    'h3': (1, 34, ''),
    'h4': (1, 36, ''),
    'h5': (1, 38, ''),
    'h6': (1, 40, ''),
    'header': (1, 15, ''),
    'hr': (1, 30, ''),
    'i': (0, 0, 'st'),
    'ins': (0, 0, 'se'),
    'li': (1, 75, 'li'),  # lvl revised if not inside p
    'main': (1, 10, ''),
    'mark': (0, 0, 's'),
    'nav': (1, 0, ''),  # keep for footnotes
    'ol': (1, 70, 'l'),  # lvl revised if not inside p
    'p': (1, 60, ''),
    'pre': (1, 65, 'q'),
    'q': (1, 0, 'q'),
    's': (0, 0, ''),
    'section': (1, 24, ''),
    'small': (0, 0, 'd'),
    'span': (0, 0, 's'),
    'strike': (0, 0, 'se'),
    'strong': (0, 0, 'st'),
    'sub': (0, 0, ''),
    'summary': (1, 20, 'm'),
    'sup': (0, 0, ''),
    'table': (1, 65, ''),
    'tbody': (1, 70, ''),
    'td': (1, 78, ''),
    'tfoot': (1, 70, ''),
    'th': (1, 75, ''),
    'thead': (1, 70, ''),
    'time': (0, 0, 'm'),
    'tr': (1, 75, ''),
    'u': (0, 0, 's'),
    'ul': (1, 70, 'l'),  # lvl revised if not inside p
}
"""
Tags to keep for annotation, and their properties.

The properties are:

* sep: whether to separate text at both sides of the tag with a space
* lvl: structural depth level of content of this tag;
  the paragraph level is 60; headings are below 60, listings above;
  a div below the tag will usually have the tag's depth + 1
* sem: semantic categories: zero or more of
  * s=span
  * l=listing
  * i=list_item
  * t=term
  * e=edit
  * d=details
  * q=quote
  * m=meta
  * x=exclude
"""


self_closing_tags = ('br', 'hr')
"""
Those among keep_tags which are self-closing.
"""


all_self_closing_tags = (
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
)
"""
All self-closing tags of the html standard.
"""


drop_roles = (
    'banner',
    'complementary',
    'contentinfo',
    'dialog',
    'figure',
    'form',
    'img',
    'search',
    'switch',
)
"""
Drop tags with these aria roles.
"""
7
tests/__init__.py
Normal file
@@ -0,0 +1,7 @@
from .annotation import AnnotateTest
from .date_finder import DateFinderTest
from .page import PageCleanTest
from .section import IterSectionTest, AggSectionTest
from .simhash import SimhashTest
from .text import CleanHtmlTest
from .durl import DurlTest
49
tests/annotation.py
Normal file
@@ -0,0 +1,49 @@
"""
Test cases for text annotation.
"""

from unittest import TestCase

from atextcrawler.utils.annotation import annotate


class AnnotateTest(TestCase):
    """
    Test annotation.

    Consider that the <br> and <hr> tags are self-closing.
    """

    def test_annotate_1(self):
        s = '<em>Hello</em><br><strong>world</strong>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(anns['section_ids'], {})

    def test_annotate_2(self):
        s = '<em> Hello </em><br><strong> world </strong>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(anns['section_ids'], {})

    def test_annotate_3(self):
        s = '<p> Hello <em>world</em> </p> '
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 60})

    def test_annotate_4(self):
        s = '<div id = "ref1"><p>Hello <em>world</em> </p> </div>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 60})
        self.assertEqual(anns['section_ids'], {0: ['ref1']})

    def test_annotate_5(self):
        s = '<br id="ref2"> Hello <p>world </p> '
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 60})
        self.assertEqual(anns['section_ids'], {1: ['ref2']})
20
tests/date_finder.py
Normal file
@@ -0,0 +1,20 @@
from datetime import datetime
from unittest import TestCase

from atextcrawler.utils.date_finder import extract_latest_date


class DateFinderTest(TestCase):
    def test_extract_latest_date(self):
        s = 'test 1987-2+1-no'
        r = datetime(1987, 2, 1)
        self.assertEqual(extract_latest_date(s), r)
        s = '2020-04-06, whatever and 1987-2-1, 1/20/2021'
        r = datetime(2020, 4, 6)
        self.assertEqual(extract_latest_date(s, lang='de'), r)
        s = 'test 2022-04-06, whatever and 1987-2-1, 1/20/2021'
        r = datetime(2021, 1, 20)
        self.assertEqual(extract_latest_date(s, lang='en'), r)
        s = ''
        r = None
        self.assertEqual(extract_latest_date(s), r)
68
tests/durl.py
Normal file
@@ -0,0 +1,68 @@
from unittest import IsolatedAsyncioTestCase
import asyncpg
from atextcrawler.utils.durl import Durl
from atextcrawler.config import Config
from atextcrawler.db import PGPool


class DurlTest(IsolatedAsyncioTestCase):
    async def asyncSetUp(self):
        config = Config().get()
        self.pool = PGPool(config['postgresql'])
        await self.pool.__aenter__()
        self.conn = await self.pool.pool.acquire()

    async def test_durl_basic(self):
        durl1 = await Durl('https://U:Pw@www.EXAMPLE.com:8000/hello?world#a')
        self.assertEqual(durl1.scheme, 'https')
        self.assertEqual(durl1.netloc, 'U:Pw@www.example.com:8000')
        self.assertEqual(durl1.port, 8000)
        self.assertEqual(durl1.path, '/hello')
        self.assertEqual(durl1.fragment, '')
        self.assertEqual(durl1.pwa(), 'hello?world')
        self.assertEqual(durl1.site(), 'https://U:Pw@www.example.com:8000/')
        self.assertEqual(
            durl1.url(), 'https://U:Pw@www.example.com:8000/' 'hello?world'
        )
        self.assertEqual(durl1.has_path(), True)
        durl2 = await Durl('http://www.example.com/')
        self.assertEqual(durl2.has_path(), False)
        durl3 = await Durl('ftp://www.example.com/')
        self.assertEqual(durl3, None)

    async def test_durl_with_base(self):
        durl1 = await Durl('https://www.example.com')
        self.assertEqual(durl1.path, '/')
        self.assertEqual(durl1.pwa(), '')
        self.assertEqual(durl1.has_path(), False)
        durl2 = await Durl('https://www.example.com/hello2', base=durl1)
        self.assertEqual(durl2.hostname, 'www.example.com')
        self.assertEqual(durl2.path, '/hello2')
        self.assertEqual(durl2.pwa(), 'hello2')
        durl3 = await Durl('/hello3?x=1', base=durl1)
        self.assertEqual(durl3.hostname, 'www.example.com')
        self.assertEqual(durl3.path, '/hello3')
        self.assertEqual(durl3.pwa(), 'hello3?x=1')
        self.assertEqual(durl3.site(), 'https://www.example.com/')
        durl4 = await Durl('https://www.kernel.org/', base=durl1)
        self.assertEqual(durl4, None)

    async def test_durl_with_base_and_match_base(self):
        durl1 = await Durl('https://www.example.com/base/path/')
        self.assertEqual(durl1.path, '/base/path/')
        self.assertEqual(durl1.pwa(), 'base/path/')
        self.assertEqual(durl1.has_path(), True)
        durl2 = await Durl(
            'https://www.example.com/base/', base=durl1, match_base=True
        )
        self.assertEqual(durl2, None)
        durl3 = await Durl(
            'https://www.example.com/base/path/whatever?x=1#a',
            base=durl1,
            match_base=True,
        )
        self.assertEqual(durl3.pwa(), 'whatever?x=1')

    async def asyncTearDown(self):
        await self.pool.pool.release(self.conn)
        await self.pool.pool.close()
24
tests/page.py
Normal file
@@ -0,0 +1,24 @@
"""
Test cases for resource type page.
"""

from unittest import TestCase
from atextcrawler.utils.html import clean_body

# from atextcrawler.utils.tag import drop_tags


class PageCleanTest(TestCase):
    def test_clean_body_1(self):
        s = ' <em>Hello</em> <strong>world</strong> '
        r = '<em>Hello</em> <strong>world</strong>'
        self.assertEqual(clean_body(s), r)


# def test_drop_tags(self):
#     s = '<figure what="ever">something<figure>else</figure>...</figure>'
#     r = drop_tags(s)
#     self.assertEqual(r, '')
#     s = '<rt><rt><rt><rt>something</rt></rt></rt></rt>'
#     r = drop_tags(s)
#     self.assertEqual(r, '')
105
tests/section.py
Normal file
@@ -0,0 +1,105 @@
from unittest import TestCase

from atextcrawler.utils.section import concat_section_texts, iter_sections


class IterSectionTest(TestCase):
    def test_iter_sections_1(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 80, 5: 2, 15: 1, 20: 3}
        sections1 = list(iter_sections(s, sb, max_level=100))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_2(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 4, 5: 2, 15: 1, 20: 3, 26: 9}
        sections1 = list(iter_sections(s, sb, max_level=100))
        sections2 = [
            (0, 5, 4, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'vwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_3(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {5: 2, 15: 60, 18: 50, 20: 3}
        sections1 = list(iter_sections(s, sb, max_level=59))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_4(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {5: 2, 15: 60, 18: 50, 20: 3, 24: 60}
        sections1 = list(iter_sections(s, sb, max_level=59))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)


class AggSectionTest(TestCase):
    def test_concat_sections_1(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 15: 1, 20: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_2(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 2: 1, 10: 1, 20: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghij'),
            ([2, 3, 4], 'klmnopqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_3(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 4: 1, 6: 1, 16: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1, 2], 'abcdefghijklmnop'),
            ([3, 4], 'qrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_4(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 15: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_5(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 12: 1, 22: 1, 23: 1, 24: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijkl'),
            ([2, 3, 4, 5, 6], 'mnopqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)
54
tests/simhash.py
Normal file
@@ -0,0 +1,54 @@
"""
Test cases for simhash-based similarity.
"""

from unittest import TestCase
from simhash import Simhash, SimhashIndex
from atextcrawler.utils.similarity import (
    create_simhash,
    get_features,
    get_simhash,
    postgresql_bigint_offset,
    search_simhash,
)


class SimhashTest(TestCase):
    """
    Test simhash creation and search.
    """

    def test_search(self):
        n1 = int('1111111100000000', 2)
        n2 = int('1111111100000111', 2)
        n3 = int('1000000000000000', 2)
        n4 = int('1000000000000111', 2)
        n5 = int('1000001111000000', 2)
        objs = [
            ('1', Simhash(n1)),
            ('3', Simhash(n3)),
            ('4', Simhash(n4)),
        ]
        index = SimhashIndex(objs, k=3)
        found = search_simhash(index, Simhash(n5))
        self.assertEqual(found, [])
        found = search_simhash(index, Simhash(n1))
        self.assertEqual(found, [1])
        found = search_simhash(index, Simhash(n2))
        self.assertEqual(found, [1])
        found = search_simhash(index, Simhash(n4))
        self.assertEqual(found, [3, 4])

    def test_create(self):
        index = SimhashIndex([], k=3)
        hash_val_1 = create_simhash(index, 101, get_simhash('hello ' * 20))
        hash_val_2 = create_simhash(index, 102, get_simhash('another one'))
        simhash_1 = Simhash(hash_val_1 + postgresql_bigint_offset)
        simhash_2 = Simhash(hash_val_2 + postgresql_bigint_offset)
        found = search_simhash(index, simhash_1)
        self.assertEqual(found, [101])
        found = search_simhash(index, simhash_2)
        self.assertEqual(found, [102])
        simhash_3 = get_simhash('hello ' * 20 + 'X')
        found = search_simhash(index, simhash_3)
        self.assertEqual(found, [101])
65
tests/text.py
Normal file
@@ -0,0 +1,65 @@
"""
Test cases for text util.
"""

from unittest import TestCase
from atextcrawler.utils.html import clean_page


class CleanHtmlTest(TestCase):
    """
    Test clean_page.

    Have an eye on self-closing tags (br, hr, ...).
    """

    def test_clean_page_1(self):
        s = '<em>Hello</em><br><script>malicious="<script>"</script>anything'
        r = '<em>Hello</em><br/>anything'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_2(self):
        s = '<em>Hello</em><br /><script>malicious<script></script>anything'
        r = '<em>Hello</em><br/>anything'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_3(self):
        # nesting
        s = '--<figure>xx<figure>yy</figure>zz</figure>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_4(self):
        # aria-hidden
        s = '--<p aria-hidden=true>xx</p>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden="true">xx</p>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden=false>xx</p>..'
        r = '--<p aria-hidden="false">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden="false">xx</p>..'
        r = '--<p aria-hidden="false">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden=??>xx</p>..'
        r = '--<p aria-hidden="??">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_5(self):
        # no removal
        s = '--<p>xx<em>yy</em></p>..'
        r = '--<p>xx<em>yy</em></p>..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_6(self):
        # self-closing tags to be removed
        s = '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn'
        r = '--<p>xx</p>\n...<h1>tt</h1>nn'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_7(self):
        s = '--<p rel=search>tt<area /></p>nn'
        r = '--nn'
        self.assertEqual(str(clean_page(s)), r)