Put under version control

parent d26d23348b
commit a6af5b12d2

83 changed files with 20130 additions and 0 deletions
.gitignore (vendored, Normal file, 51 lines)
@@ -0,0 +1,51 @@
# Backup files
*.~

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
NOTES

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml
htmlcov

# Translations
*.mo

# mypy cache
.mypy_cache

# Sphinx documentation
doc/build/
doc/source/reference/

# tmp dir
tmp/

.pre-commit-config.yaml (Normal file, 30 lines)
@@ -0,0 +1,30 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
  - repo: https://github.com/psf/black
    rev: 21.11b1
    hooks:
      - id: black
  - repo: https://github.com/timothycrosley/isort
    rev: 5.10.1
    hooks:
      - id: isort
        args: ["--profile", "black", "--filter-files", "-l", "79"]
  - repo: https://github.com/myint/autoflake
    rev: v1.4
    hooks:
      - id: autoflake
        args:
          [
            "--in-place",
            "--remove-all-unused-imports",
            "--ignore-init-module-imports",
            "--remove-unused-variables",
          ]

Pipfile (Normal file, 46 lines)
@@ -0,0 +1,46 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
aiohttp = "*"
async-lru = "*"
asyncpg = "*"
beautifulsoup4 = "*"
elasticsearch = { version = ">=7.0.0,<8.0.0", extras = ['async'] }
elasticsearch-dsl = { version = ">=7.0.0,<8.0.0" }
feedparser = "*"
gcld3 = "*"
# TODO: recheck
pypandoc = "*"
pytidylib = "*"
pytz = "*"
pyyaml = "*"
tika = "*"
tldextract = "*"
voluptuous = "*"
simhash = "*"
async-dns = "*"
types-pyyaml = "*"
sphinx-rtd-theme = "*"

[dev-packages]
mypy = "*"
pre-commit = "*"
sphinx = "*"
myst-parser = "*"
isort = "*"
blacken-docs = "*"
pybetter = "*"
interrogate = "*"
autoflake = "*"
types-pyyaml = "*"
types-pytz = "*"
black = "*"

[requires]
python_version = "3.9"

[pipenv]
allow_prereleases = true

Pipfile.lock (generated, Normal file, 1561 lines)
File diff suppressed because it is too large

README.md (Normal file, 13 lines)
@@ -0,0 +1,13 @@
atextcrawler is an asynchronous web crawler indexing text for literal and semantic search.

Its client-side counterpart is [atextsearch](https://gitea.multiname.org/a-text/atextsearch).

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

doc/Makefile (Normal file, 20 lines)
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

doc/source/conf.py (Normal file, 71 lines)
@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys

proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')


# -- Project information -----------------------------------------------------

project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'

# The full version, including alpha/beta/rc tags
release = '0.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'myst_parser',
    'sphinx.ext.graphviz',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

autosummary_generate = True

source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}

doc/source/config_template/initial_data/seed_urls.list (Normal file, 23 lines)
@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.

# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/

# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/

doc/source/config_template/main.yaml (Normal file, 88 lines)
@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
#  - 'dev': development instance
#  - 'staging': staging instance
#  - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as relative path, it will be relative to the
# directory of this file (main.yaml).
# Read documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________

# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item
  # Also the delay in seconds after a worker has found
  # no site to process
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext

# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict

doc/source/config_template/plugins/__init__.py (Normal file, 0 lines)

doc/source/config_template/plugins/filter_resource_path.py (Normal file, 22 lines)
@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.

This plugin implements :func:`rp_filter`.
"""

from typing import Optional


def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter found paths (may depend on site).

    To filter out a path (i.e., not add it to table `site_path`)
    return None.
    """
    path = durl.pwa()
    # skip fetching images (linked from <a> tags; img tags are skipped anyway)
    if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
        return None
    path = path.removesuffix('?amp=1')
    return path

doc/source/config_template/plugins/filter_site.py (Normal file, 47 lines)
@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.

This plugin implements :func:`site_filter`.
"""

import re

from atextcrawler.models import Site

MIN_RELEVANCE_SCORE = 5


async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    If the site shall be crawled, return True, else False.
    """
    # limit to sites in English or German language
    if not set(['de', 'en']) & set(site.langs):
        return False
    score = 0.0
    for crit_name, weight, langs, crit_re in re_criteria:
        if '*' in langs or set(langs) & set(site.langs):
            findings = crit_re.findall(site.startpage_text)
            if findings:
                score += weight * len(findings)
            if site.title and crit_re.search(site.title):
                score += 4 * weight
            if site.description and crit_re.search(site.description):
                score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE


re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    ('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
}

doc/source/config_template/plugins/filter_site_path.py (Normal file, 24 lines)
@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.

This plugin implements :func:`sp_filter`.
"""


def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    if 'amusewiki' in site.meta_info.get('generator', '').lower():
        if any(
            [
                path.endswith(end)
                for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
            ]
        ):
            return False
    if '/bbselect?' in path:
        return False
    return True

doc/source/devel/devel.md (Normal file, 63 lines)
@@ -0,0 +1,63 @@
## Set up the dev environment
1. You need Python 3.9 or later.
1. Have pipenv installed, e.g.: install pip3 (`apt install python3-pip`), then `pip3 install --user pipenv`.
1. Clone the repo and set up a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```

## Configure the instance
See [installation](installation.md).

## Run
```
python -m atextcrawler
```

## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```

## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```

## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```

## Release
There are no releases (currently).

## Useful commands

### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```

### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;

http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*

http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices

-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
```

doc/source/devel/related_work.md (Normal file, 64 lines)
@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)

### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
* [repo](https://github.com/adbar/trafilatura)
* [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)

#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)

### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)

### url handling
* [courlan](https://pypi.org/project/courlan/)

### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)

### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/)

### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously

### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
* https://support.shareaholic.com/hc/en-us/articles/115003085186

### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language

ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview

doc/source/devel/todo.md (Normal file, 77 lines)
@@ -0,0 +1,77 @@
## TODO

* parse html time tags

* site annotations:
  * categories
  * historical (no changes any more since n months)
  * news
  * local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip

* allow for tls in elasticsearch config

* replace dashes, dots and quotes: https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
'–': '--',
'–': '--',
'–': '--',
'—': '---',
'—': '---',
'—': '---',
'…': '...',
'…': '...',
'…': '...',
'“': '"',
'”': '"',
'„': '"',
'″': '"',
'“': '"',
'”': '"',
'„': '"',
'″': '"',
'“':'"',
'”':'"',
'„':'"',
'″':'"',
'‘':"'",
'’':"'",
'′':"'",
'‘':"'",
'’':"'",
'′':"'",
'‘':"'",
'’':"'",
'′':"'",
```
* normalize quotation marks and punctuation in general
  * https://unicode-table.com/en/sets/quotation-marks/
  * https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
  * https://www.fileformat.info/info/unicode/category/Po/list.htm
  * https://www.gaijin.at/en/infos/unicode-character-table-punctuation
  * ⁝

* cancel crawls that take too long

* search for "TODO" in code

* feedparser has support for JSON feeds since commit
  a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
  (as of 2020-10-26 in "develop" branch, not part of a release);
  the version names are 'json1' and 'json11'

* allow site URLs with path, e.g.
  https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/

* add more languages

## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives

* [space-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)

* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)

doc/source/development.rst (Normal file, 9 lines)
@@ -0,0 +1,9 @@
Development
-----------

.. toctree::
   :maxdepth: 2

   devel/devel
   devel/todo
   devel/related_work

doc/source/elasticsearch.md (Normal file, 119 lines)
@@ -0,0 +1,119 @@
# Howto elasticsearch

## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```

If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```

## Setup

### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).

We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.

```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```

First test:
```
http -j GET 127.0.0.1:9200/
```

### Storage

```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```

Edit /etc/elasticsearch/elasticsearch.yml
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```

```
systemctl restart elasticsearch
```

The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```

### Setup passwords
Setup passwords:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```

Copy the output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```

Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```

### Memory limitation
To limit memory usage:
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF

systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```

## Usage
Some useful requests:

### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```
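
The `id` and `api_key` from the response are what go into the `elasticsearch` section of atextcrawler's `main.yaml`. For reference, authenticating with such a key from Python might look roughly like this (a minimal sketch using the `elasticsearch[async]` package from the Pipfile; the id/secret values are placeholders):
```
import asyncio

from elasticsearch import AsyncElasticsearch

# Placeholders; use the id and api_key returned by the request above.
es = AsyncElasticsearch(
    hosts=['http://127.0.0.1:9200'],
    api_key=('API_KEY_ID', 'API_KEY_SECRET'),
)

async def check():
    # True if the key is valid and the cluster is reachable.
    print(await es.ping())
    await es.close()

asyncio.run(check())
```
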
doc/source/index.rst (Normal file, 37 lines)
@@ -0,0 +1,37 @@
atextcrawler
============

atextcrawler is an asynchronous web crawler indexing text
for literal and semantic search.

Its client-side counterpart is atextsearch_.

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   introduction
   installation
   maintenance
   development
   reference/modules


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

doc/source/installation.md (Normal file, 122 lines)
@@ -0,0 +1,122 @@
# Installation
Installation was only tested on Debian bullseye (on amd64).
The instructions below are for this system.
(Please adapt to other environments.)

## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for the Python package gcld3 (see below).

## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```

## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).

Note: TLS is not yet supported, so install this service locally.

See the [elasticsearch howto](elasticsearch.md).

## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.

Note: TLS is not yet supported, so install this service locally.

See the [tensorflow howto](tensorflow_model_server.md).

## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:\$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages  # for systemd
pre-commit install
```

Note: One of the dependencies, the Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```

## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```

Edit `$HOME/.config/atextcrawler/main.yaml`.

If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/atextcrawler/plugins
```
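
For example, a pared-down override of `site_filter` could look like this (a minimal sketch; `site.base_url` is an attribute used by the default plugins, while the host list is purely illustrative):
```
"""
Overridden relevance estimation: accept only explicitly listed hosts.
"""

from atextcrawler.models import Site

# Illustrative whitelist; replace with your own criteria.
ACCEPTED_HOSTS = ('library.example.org', 'blog.example.net')


async def site_filter(site: Site) -> bool:
    """
    Return True if the site's base URL matches one of the accepted hosts.
    """
    return any(host in site.base_url for host in ACCEPTED_HOSTS)
```
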

Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.

Check (and print) the instance configuration:
```
python -m atextcrawler.config
```

## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.

## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target

[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```

doc/source/introduction.md (Normal file, 66 lines)
@@ -0,0 +1,66 @@
# Introduction

## What atextcrawler does
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
  of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
  optimized for html5); discard non-text content, but handle feeds
  and sitemaps
* Extract internal and external links; external links contribute
  to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch,
  using tensorflow model server with a multilingual language model

## Architecture
There is only one Python process; we use asyncio for concurrency
where possible (almost everywhere).

1. There is a queue of websites, see database table `site_queue`.
   The queue is fed a) on first startup with seeds, b) manually
   and c) from crawls which find external links.
   When the queue is handled, new sites are stored in table `site`.
   New sites are always updated; existing sites only if their last
   update was more than `crawl.site_revisit_interval` seconds in the past.
   After the queue has been handled there is a delay
   (`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
   criteria are applied to its content to determine whether
   the site is relevant. (It is assumed that (non-)relevance is
   obvious from the start page already.) If the site is relevant,
   more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
   `crawl.workers`) which concurrently crawl sites, one at a time
   per worker. (During the crawl the site is marked as locked using
   crawl_active=true.) They pick a relevant site which has not been crawled for a certain time ("checkout"), crawl it, and finally mark it as crawled (crawl_active=false, "checkin") and schedule the next crawl; see the sketch after this list.
   Each crawl (with begin time, end time, number of found (new)
   resources) is stored in table `crawl`.
1. Crawls are either full crawls (all paths reachable
   through links from the start page are fetched) or feed crawls (only paths listed in a feed of the site are fetched). The respective (minimum) intervals in which these crawls happen are `full_crawl_interval` and `feed_crawl_interval`.
   Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
   sitemap) or a TextResource (redirects are followed and irrelevant content is ignored). A TextResource obtained from a path can be very similar to a resource obtained from another path; in this case no new resource is created, but both paths are linked to the same resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
   added to table `site_path`. If it is a feed, the feed is stored in table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.
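
The checkout/crawl/checkin cycle of a worker can be sketched roughly as
follows (illustrative only; the helper names are placeholders, not
necessarily the names used in the code):
```
# Rough sketch of one CrawlWorker's loop (helper names are hypothetical).
async def run(self):
    while self.app.running:
        site = await checkout_site(self.pool)       # pick a due site, set crawl_active=true
        if site is None:
            await self.app.sleep(site_delay)        # no site due: wait and retry
            continue
        crawl = await start_crawl(self.pool, site)  # row in table `crawl` (begin time)
        await crawl_site(self, site)                # fetch paths, store resources
        await finish_crawl(self.pool, crawl)        # end time, number of resources
        await checkin_site(self.pool, site)         # crawl_active=false, schedule next crawl
```
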

## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are considered:
blacklisting or whitelisting has precedence over function `site_filter`
(in plugin `filter_site`).

Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).

Each annotation requires a base_url of the annotated site and,
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).

## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
  background task on a server with limited resources
  (or even an SBC, like a raspberry pi, with attached storage)
* atextcrawler only indexes text, no other resources like images

doc/source/maintenance.md (Normal file, 23 lines)
@@ -0,0 +1,23 @@
# Maintenance

## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```

## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```

doc/source/tensorflow_model_server.md (Normal file, 98 lines)
@@ -0,0 +1,98 @@
# Tensorflow model server

## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```

## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget -O universal-sentence-encoder-multilingual_3.tar.gz 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed'
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```

Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```

Config file `/srv/tensorflow/config`:
```
model_config_list: {
  config: {
    name: "sentences",
    base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
    model_platform: "tensorflow"
    model_version_policy: {latest{}},
  },
  config: {
    ... (next model)
  },
}
```

## Systemd integration
Edit /etc/systemd/system/tensorflow.service
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service

[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```

Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```

## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```
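
Get embeddings from the `:predict` endpoint (the endpoint that atextcrawler's `tensorflow.model_server_endpoint` setting points at); a minimal sketch using `aiohttp` from the Pipfile, assuming the served model returns one embedding vector per input sentence:
```
import asyncio

import aiohttp

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'

async def embed(sentences):
    # TF Serving's REST predict API expects a JSON body with "instances".
    async with aiohttp.ClientSession() as session:
        async with session.post(ENDPOINT, json={'instances': sentences}) as resp:
            data = await resp.json()
    return data['predictions']  # one embedding vector per input sentence (assumed layout)

vectors = asyncio.run(embed(['What does atextcrawler do?']))
print(len(vectors[0]))
```
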

## Docs

* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server

Datasets:
* https://www.tensorflow.org/datasets/catalog/overview

license.txt (Normal file, 48 lines)
@@ -0,0 +1,48 @@
ANTI-AUTHORITARIAN LICENSE version 1.0
________________________________________________________________________________

Obviously, this license is relevant to all who are bound by law.

The licensee ("you") must not be a commercial, military, clerical or
governmental entity. For this license the term "software" means the program
code, documentation as well as other data (for instance, language files).

Subject to the respective terms and conditions described below the licensee
is granted the non-exclusive and non-transferable license to:
A. make copies of the software
B. create derivative works ("modifications")
C. install and run copies or modifications of the software on any number of
   servers, thereby making them usable for the licensee and possibly others
D. offer or give copies or modifications of the software, or parts of the
   unmodified or modified software to others

For these permissions the respective conditions stated below must be met:
* For permission A condition 1 must be met.
* For permission B all of the conditions 1, 3, 4 must be met.
* For permission C all of the conditions 2, 3 must be met.
* For permission D all of the conditions 1, 2, 3, 4, 5 must be met.

These are the conditions:
1. You include this copyright notice and license in any copy or modification.
   In files that contain a reference to it you preserve this reference.
2. You do not use this software or any modification of it for any commercial
   purpose or for monetary gain, and also not for any military, governmental
   or religious purpose; here with commercial purpose we mean activities which
   have among their goals to make profit, be it monetary profit or any other
   kind of profit that may entail or contribute to monetary profit.
3. Demos or screenshots of the modified or unmodified software must not be
   published in any medium which requires the viewers to pay money in order
   to see the contents; here money paid for mere internet connectivity (i.e.,
   independent of the content supplier) is to be disregarded.
4. You do not impose any further restrictions on this software or any
   derivative works beyond those restrictions herein.
5. The copy or modification must include source code, and must allow
   distribution in source code as well as compiled form. The source code
   must be the preferred form in which a programmer would modify the program.
   Deliberately obfuscated source code is not allowed. Intermediate forms
   such as the output of a preprocessor or translator are not allowed.

For this license itself, if re-used for other software, the following
copyright and license applies (copyheart license):

♡ Copying is an act of love. Please copy.

pyproject.toml (Normal file, 10 lines)
@@ -0,0 +1,10 @@
# TOML formatted file; see PEP 518

[tool.isort]
profile = "black"
#multi_line_output = 3

[tool.black]
line-length = 79
target_version = ['py39']
skip-string-normalization = true

src/atextcrawler/__init__.py (Normal file, 0 lines)

src/atextcrawler/__main__.py (Normal file, 12 lines)
@@ -0,0 +1,12 @@
"""
atextcrawler application execution entry point.
"""

import asyncio

from .application import Application
from .config import Config

if __name__ == '__main__':
    config = Config().get()
    asyncio.run(Application(config).run())

src/atextcrawler/application.py (Normal file, 204 lines)
@@ -0,0 +1,204 @@
"""
atextcrawler application.
"""

import asyncio
import importlib
import logging
import signal
import sys

from systemd.journal import JournalHandler

from .config import Config
from .crawl import CrawlWorker
from .db import PGPool
from .search import shutdown_engine, startup_engine
from .site import load_seeds, process_site_queue

plugin_names = ['filter_site', 'filter_site_path', 'filter_resource_path']


class Application:
    """
    atextcrawler application.

    The basic structure of the application is this:
    * one site crawler works just on the site_queue: fetching start pages
      of sites and storing updated site information in table sites
    * N other CrawlWorkers each do this in a loop:
      checkout a site that is due for crawl and crawl its resources;
      they fill the site_queue
    """

    running = True

    def __init__(self, config=None):
        if config is None:
            config = Config().get()
        self.config = config
        self.instance_name = config['instance_name']
        self.instance_type = config['instance_type']
        log_level = getattr(
            logging, config['log_level'].upper(), logging.CRITICAL
        )
        self.logger = logging.getLogger('atextcrawler')
        self.logger.setLevel(log_level)
        if self.instance_type == 'dev':
            self.logger.addHandler(logging.StreamHandler())
        else:
            self.logger.addHandler(
                JournalHandler(SYSLOG_IDENTIFIER=self.instance_name)
            )
        self.logger.propagate = False
        self.channel = 'atextcrawler_' + self.config['instance_name']
        msg = f'Instance "{self}" initializing'
        self.logger.info(msg)
        self.plugins = self._load_plugins()

    def __str__(self):
        return self.instance_name

    def _load_plugins(self):
        """
        Return a dict mapping plugin names to modules.
        """
        modules = {}
        old_path = sys.path.copy()
        for name in plugin_names:
            try:
                plugins_dir = self.config['plugins_dir']
                sys.path.insert(0, plugins_dir)
                module = importlib.import_module(name)
                msg = f'Loading plugin "{name}" from {plugins_dir}'
            except:
                module = importlib.import_module(
                    'atextcrawler.plugin_defaults.' + name
                )
                msg = f'Loading plugin "{name}" from default location'
            self.logger.info(msg)
            modules[name] = module
        sys.path = old_path
        return modules

    async def run(self):
        """
        Application lifecycle.
        """
        await asyncio.gather(self.wait_for_shutdown(), self.startup())
        await self.shutdown()

    async def startup(self):
        """
        Asynchronous startup.
        """
        msg = f'Instance "{self}" starting components'
        self.logger.info(msg)
        self.search_engine = await startup_engine(self.config)
        self.pgpool = await PGPool(self.config['postgresql'])
        self.pool = self.pgpool.pool
        await load_seeds(self.config, self.pool)
        await reset_site_locks(self.pool)
        worker_count = self.config['crawl']['workers']
        self.workers = []
        for worker_number in range(worker_count):
            worker = await CrawlWorker(self, worker_number, self.pool)
            self.workers.append(worker)
        worker_coros = [worker.run() for worker in self.workers]
        await asyncio.gather(
            process_site_queue(self, self.pool),
            self.handle_notifications(),
            *worker_coros,
        )

    async def wait_for_shutdown(self):
        """
        Create a shutdown event (:class:`asyncio.Event`) and wait for it.

        The event will be set by a signal handler for SIGINT
        and SIGTERM signals (see :meth:`Application.handle_shutdown_signal`).
        """
        self.shutdown_event = asyncio.Event()
        for sig in (signal.SIGINT, signal.SIGTERM):
            asyncio.get_running_loop().add_signal_handler(
                sig, self.handle_shutdown_signal
            )
        self.logger.debug(f'{self} waiting for shutdown event')
        await self.shutdown_event.wait()
        self.logger.info(f'Instance "{self}" shutdown event')

    def handle_shutdown_signal(self):
        """
        Handle shutdown signal.
        """
        if self.shutdown_event.is_set():
            return
        self.shutdown_event.set()
        self.running = False

    async def shutdown(self):
        """
        Asynchronous shutdown.
        """
        self.logger.debug(f'Instance "{self}" shutting down')
        await self.notify_conn.remove_listener(
            self.channel, self.listen_callback
        )
        await self.pool.release(self.notify_conn)
        for worker in self.workers:
            await worker.shutdown()
        await shutdown_engine(self.search_engine)
        await self.pgpool.shutdown()
        self.logger.info(f'Instance "{self}" shutdown completed')

    async def handle_notifications(self):
        """
        Handle notifications using PostgreSQL's NOTIFY/LISTEN.
        """
        self.notify_conn = await self.pool.acquire()
        await self.notify_conn.add_listener(self.channel, self.listen_callback)

    def listen_callback(self, *args):
        """
        Handle notify event from PostgreSQL.
        """
        # asyncpg invokes the callback with (connection, pid, channel, payload)
        channel = args[2]
        if channel != self.channel:
            return
        message = args[3]
        if message.startswith('site_update '):
            try:
                site_id = int(message.removeprefix('site_update '))
                for worker in self.workers:
                    if worker.site and site_id == worker.site.id_:
                        msg = (
                            f'Cancelling worker {worker.worker_number}'
                            f' (site={site_id}) due to site_update'
                        )
                        self.logger.info(msg)
                        worker.running = False
            except:
                pass

    async def sleep(self, duration, t_slice=3):
        """
        Sleep for *duration* seconds while self.running.

        Check self.running every *t_slice* seconds.
        """
        remaining = duration
        while remaining > 0 and self.running:
            await asyncio.sleep(min(t_slice, remaining))
            remaining -= t_slice


async def reset_site_locks(pool):
    """
    Remove locks leftover from the last run: set crawl_active=false for all sites.

    This is relevant when the application was not shut down properly (e.g.
    when the process was killed).
    """
    async with pool.acquire() as conn:
        sql = "UPDATE site SET crawl_active = false WHERE crawl_active = true"
        await conn.execute(sql)

src/atextcrawler/assets/iana_langs (Normal file, 7 lines)
@@ -0,0 +1,7 @@
The recommended language tags to use in webpages are from
the IANA Language Subtag Registry (BCP47), see:
https://www.w3.org/International/questions/qa-html-language-declarations
https://r12a.github.io/app-subtags/


wget -O- https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | rg '^Subtag: |^Tag: ' atextcrawler/assets/iana_langs_ | sed -e 's/^Subtag: //' | sed -e 's/^Tag: //'

src/atextcrawler/assets/iso_639-1 (Normal file, 219 lines)
@@ -0,0 +1,219 @@
aa
ab
ae
af
ak
am
an
ar
as
av
ay
az
ba
be
bg
bh
bi
bm
bn
bo
br
bs
ca
ca
ce
ch
co
cr
cs
cu
cu
cu
cu
cu
cv
cy
da
de
dv
dv
dv
dz
ee
el
en
eo
es
es
et
eu
fa
ff
fi
fj
fo
fr
fy
ga
gd
gd
gl
gn
gu
gv
ha
he
hi
ho
hr
ht
ht
hu
hy
hz
ia
id
ie
ie
ig
ii
ii
ik
io
is
it
iu
ja
jv
ka
kg
ki
ki
kj
kj
kk
kl
kl
km
kn
ko
kr
ks
ku
kv
kw
ky
ky
la
lb
lb
lg
li
li
li
ln
lo
lt
lu
lv
mg
mh
mi
mk
ml
mn
mr
ms
mt
my
na
nb
nb
nd
nd
ne
ng
nl
nl
nn
nn
no
nr
nr
nv
nv
ny
ny
ny
oc
oj
om
or
os
os
pa