Put under version control

ibu 2021-11-29 09:16:31 +00:00
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

doc/source/conf.py (new file)
@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys
proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')
# -- Project information -----------------------------------------------------
project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'
# The full version, including alpha/beta/rc tags
release = '0.1.0'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.autosummary',
'myst_parser',
'sphinx.ext.graphviz',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
autosummary_generate = True
source_suffix = {
'.rst': 'restructuredtext',
'.md': 'markdown',
}

@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.
# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/
# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/
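
For illustration, a minimal sketch of parsing this seed-list format in Python (the helper name `parse_seed_list` is hypothetical and not part of atextcrawler):
```
def parse_seed_list(text: str):
    """Split a seed list into whitelist and blacklist (hypothetical helper)."""
    whitelist, blacklist = [], []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue  # skip comments and empty lines
        if line.startswith('+'):
            whitelist.append(line[1:])
        elif line.startswith('-'):
            blacklist.append(line[1:])
    return whitelist, blacklist
```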

@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler
# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod
# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info
# Plugins directory
# If given as relative path, it will be relative to the
# directory of this file (main.yaml).
# Read documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins
# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________
# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3
  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item
  # Also the delay in seconds after a worker has found
  # no site to process
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10
  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600
  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3
  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000
  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400
# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext
# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict
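
For illustration, a minimal sketch of loading such a file with PyYAML and falling back to the documented defaults (the loader shown here is hypothetical; atextcrawler's actual config handling may differ):
```
import os
import yaml  # PyYAML

def load_main_config(path=os.path.expanduser('~/.config/atextcrawler/main.yaml')):
    """Load main.yaml and apply the documented defaults for optional crawl settings."""
    with open(path) as f:
        config = yaml.safe_load(f)
    crawl = config.setdefault('crawl', {})
    crawl.setdefault('workers', 10)
    crawl.setdefault('site_delay', 600)
    crawl.setdefault('site_revisit_interval', 3600)
    crawl.setdefault('resource_delay', 5)
    crawl.setdefault('full_crawl_interval', 864000)
    crawl.setdefault('feed_crawl_interval', 86400)
    return config
```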

@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.
This plugin implements :func:`rp_filter`.
"""
from typing import Optional
def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter found paths (may depend on site).
    To filter out a path (i.e., not add it to table `site_path`)
    return None.
    """
    path = durl.pwa()
    # skip fetching images (linked from a tags; img tags are skipped anyway)
    if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
        return None
    path = path.removesuffix('?amp=1')
    return path
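
A quick way to exercise this filter (the `_FakeDurl` class is a hypothetical stand-in exposing only the `pwa()` method used above):
```
class _FakeDurl:
    def pwa(self):
        return '/2021/11/some-article?amp=1'

print(rp_filter(None, _FakeDurl()))  # '/2021/11/some-article'
```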

@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.
This plugin implements :func:`site_filter`.
"""
import re
from atextcrawler.models import Site
MIN_RELEVANCE_SCORE = 5
async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).
    If the site shall be crawled, return True, else False.
    """
    # limit to sites in English or German language
    if not set(['de', 'en']) & set(site.langs):
        return False
    score = 0.0
    for crit_name, weight, langs, crit_re in re_criteria:
        if '*' in langs or set(langs) & set(site.langs):
            findings = crit_re.findall(site.startpage_text)
            if findings:
                score += weight * len(findings)
            if site.title and crit_re.search(site.title):
                score += 4 * weight
            if site.description and crit_re.search(site.description):
                score += 4 * weight
    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)
    return score >= MIN_RELEVANCE_SCORE

re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    ('libertär', 0.5, ('de',), re.compile('(libert(är|&auml;r))', re.I)),
}
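
A worked example of the scoring: with the first criterion above, a start page mentioning 'Anarchism' and 'anarchist' contributes 2.0, while 'panarchy' is excluded by the `(?<!p)` lookbehind; a matching title or description adds 4 times the weight, and the site is kept once the total reaches `MIN_RELEVANCE_SCORE` (5). A minimal check of that regex behaviour:
```
import re

crit = re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I)
text = "Anarchism and anarchist history; 'panarchy' does not count."
print(len(crit.findall(text)))  # 2 -- only 'Anarchism' and 'anarchist' match
```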

@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    if 'amusewiki' in site.meta_info.get('generator', '').lower():
        if any(
            [
                path.endswith(end)
                for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
            ]
        ):
            return False
        if '/bbselect?' in path:
            return False
    return True
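
To see the amusewiki rule in action, the filter can be called with stand-in objects (both classes below are hypothetical and only mimic the attributes `sp_filter` touches):
```
class _FakeRobots:
    def can_fetch_url(self, url):
        return True  # pretend robots.txt allows everything

class _FakeSite:
    base_url = 'https://example.org'
    meta_info = {'generator': 'amusewiki 2.x'}

print(sp_filter(_FakeSite(), '/some-text.pdf', _FakeRobots()))  # False: download variant
print(sp_filter(_FakeSite(), '/some-text', _FakeRobots()))      # True
```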

doc/source/devel/devel.md (new file)
@@ -0,0 +1,63 @@
## Setup dev environment
1. You need Python 3.9 or later.
1. Install pipenv, e.g.: install pip3 (`apt install python3-pip`) and then run `pip3 install --user pipenv`.
1. Clone the repo and setup a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```
## Configure the instance
See [installation](installation.md).
## Run
```
python -m atextcrawler
```
## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```
## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```
## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```
## Release
There are no releases (currently).
## Useful commands
### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```
### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;
http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*
http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices
-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars
from site s
left join (
    select sp.site_id, count(sp.path) n_paths,
           count(r.id) n_resources, sum(r.text_len) n_chars
    from site_path sp
    left join resource r on sp.resource_id = r.id
    group by sp.site_id
) spr on spr.site_id = s.id
where s.relevant
order by s.id;
```

@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)
### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
* [repo](https://github.com/adbar/trafilatura)
* [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)
#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)
### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)
### url handling
* [courlan](https://pypi.org/project/courlan/)
### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)
### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/)
### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously
### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
* https://support.shareaholic.com/hc/en-us/articles/115003085186
### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language
ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview

doc/source/devel/todo.md (new file)
@@ -0,0 +1,77 @@
## TODO
* parse html time tags
* site annotations:
* categories
* historical (no changes any more since n months)
* news
* local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip
* allow for tls in elasticsearch config
* replace dashes, dots and quotes: https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
'&#8211;': '--',
'&ndash;': '--',
'–': '--',
'&#8212;': '---',
'&mdash;': '---',
'—': '---',
'&#8230;': '...',
'&hellip;': '...',
'…': '...',
'&#8220;': '"',
'&#8221;': '"',
'&#8222;': '"',
'&#8243;': '"',
'&ldquo;': '"',
'&rdquo;': '"',
'&bdquo;': '"',
'&Prime;': '"',
'“':'"',
'”':'"',
'„':'"',
'″':'"',
'&#8216;':"'",
'&#8217;':"'",
'&#8242;':"'",
'&lsquo;':"'",
'&rsquo;':"'",
'&prime;':"'",
'‘':"'",
'’':"'",
'′':"'",
```
* normalize quotation marks and punctuation in general
* https://unicode-table.com/en/sets/quotation-marks/
* https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
* https://www.fileformat.info/info/unicode/category/Po/list.htm
* https://www.gaijin.at/en/infos/unicode-character-table-punctuation
* ⁝
* cancel crawls that take too long
* search for "TODO" in code
* feedparser has support for JSON feeds since commit
  a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
  (as of 2020-10-26 in the "develop" branch, not part of a release);
  the version names are 'json1' and 'json11'
* allow site URLs with path, e.g.
https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/
* add more languages
## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives
* [space-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)
* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)

@@ -0,0 +1,9 @@
Development
-----------
.. toctree::
   :maxdepth: 2

   devel/devel
   devel/todo
   devel/related_work

doc/source/elasticsearch.md (new file)
@@ -0,0 +1,119 @@
# Howto elasticsearch
## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```
If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```
## Setup
### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).
We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.
```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```
First test:
```
http -j GET 127.0.0.1:9200/
```
### Storage
```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```
Edit `/etc/elasticsearch/elasticsearch.yml`:
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```
```
systemctl restart elasticsearch
```
The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```
### Setup passwords
Generate passwords for the built-in users:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```
Copy output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```
Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```
### Memory limitation
To limit memory usage, create a systemd override and restart the service:
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF
systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```
## Usage
Some useful requests:
### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```
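
The returned `id` and `api_key` go into the `elasticsearch` section of `main.yaml`. A minimal sketch of using them from Python, assuming the official `elasticsearch` client package (key values and index pattern are placeholders):
```
from elasticsearch import Elasticsearch

es = Elasticsearch(
    'http://localhost:9200',
    api_key=('THE_ID', 'THE_API_KEY'),  # id and api_key as returned above
)
print(es.info())                          # verifies that authentication works
print(es.cat.indices(index='anarchism_*'))
```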

doc/source/index.rst (new file)
@@ -0,0 +1,37 @@
atextcrawler
============
atextcrawler is an asynchronous webcrawler indexing text
for literal and semantic search.
Its client-side counterpart is atextsearch_.
atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.
atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.
.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch
.. toctree::
   :maxdepth: 2
   :caption: Contents:

   introduction
   installation
   maintenance
   development
   reference/modules
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

doc/source/installation.md (new file)
@@ -0,0 +1,122 @@
# Installation
Installation was only tested on Debian bullseye (on amd64).
The instructions below are for this system.
(Please adapt to other environments.)
## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for python package gcld3 (see below).
## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```
## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).
Note: TLS is not yet supported, so install this service locally.
See [elasticsearch howto](elasticsearch.md).
## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.
Note: TLS is not yet supported, so install this service locally.
See [tensorflow howto](tensorflow_model_server.md).
## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:\$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages # for systemd
pre-commit install
```
Note: One of the dependencies, Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```
## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```
Edit `$HOME/.config/atextcrawler/main.yaml`.
If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/atextcrawler/plugins/
```
Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.
Check (and print) the instance configuration:
```
python -m atextcrawler.config
```
## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.
## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target
[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure
[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```

@@ -0,0 +1,66 @@
# Introduction
## What atextcrawler does:
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
optimized for html5); discard non-text content, but handle feeds
and sitemaps
* Extract internal and external links; external links contribute
to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch
using tensorflow model server with a multilingual language model
## Architecture
There is only one Python process; concurrency comes from asyncio,
which we use where possible (almost everywhere).
1. There is a queue of websites, see database table `site_queue`.
The queue is fed a) on first startup with seeds, b) manually
and c) from crawls which find external links.
When the queue is handled, new sites are stored in table `site`.
New sites are updated immediately; existing sites only if their last
update was more than `crawl.site_revisit_interval` seconds in the past.
After the queue has been handled there is a delay
(`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
criteria are applied to its content to determine whether
the site is relevant. (It is assumed that (non-)relevance is
obvious from the start page already.) If the site is relevant,
more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
`crawl.workers`) which concurrently crawl sites, one at a time
per worker. (During the crawl the site is marked as locked using
crawl_active=true.) A worker picks a relevant site which has not been
crawled for a certain time ("checkout"), crawls it, and finally marks it
as crawled (crawl_active=false, "checkin") and schedules the next crawl;
see the sketch after this list.
Each crawl (with begin time, end time, number of found (new)
resources) is stored in table `crawl`.
1. Crawls are either full crawls (all paths reachable through links
from the start page are fetched) or feed crawls (only paths listed in a
feed of the site are fetched). The respective (minimum) intervals in which
these crawls happen are `full_crawl_interval` and `feed_crawl_interval`.
Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
sitemap) or a TextResource (redirects are followed and irrelevant content is ignored). A TextResource obtained from a path can be very similar to a resource obtained from another path; in this case no new resource is created, but both paths are linked to the same resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
added to table `site_path`. If it is a feed, the feed is stored in table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.
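
A rough sketch of the worker loop described above (the coroutines `checkout_site`, `crawl_site` and `checkin_site` are hypothetical stand-ins for the real implementation):
```
import asyncio

# Hypothetical stand-ins for the real database/crawl operations.
async def checkout_site(pool):
    ...  # pick a relevant site due for crawling and set crawl_active=true

async def crawl_site(pool, site):
    ...  # fetch paths, store resources, record the crawl in table `crawl`

async def checkin_site(pool, site):
    ...  # set crawl_active=false and schedule the next crawl

async def worker_loop(pool, config):
    """One of `crawl.workers` concurrent crawler workers (simplified)."""
    while True:
        site = await checkout_site(pool)
        if site is None:
            await asyncio.sleep(config['crawl']['site_delay'])
            continue
        try:
            await crawl_site(pool, site)
        finally:
            await checkin_site(pool, site)
```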
## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are taken into account:
blacklisting or whitelisting takes precedence over function `site_filter`
(in plugin `filter_site`).
Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).
Each annotation requires a base_url of the annotated site and
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).
## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
background task on a server with limited resources
(or even an SBC, like raspberry pi, with attached storage)
* atextcrawler only indexes text, no other resources like images

doc/source/maintenance.md (new file)
@@ -0,0 +1,23 @@
# Maintenance
## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```
## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```

@@ -0,0 +1,98 @@
# Tensorflow model server
## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```
## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget -O universal-sentence-encoder-multilingual_3.tar.gz 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed'
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```
Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```
Config file `/srv/tensorflow/config`:
```
model_config_list: {
config: {
name: "sentences",
base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
model_platform: "tensorflow"
model_version_policy: {latest{}},
},
config: {
... (next model)
},
}
```
## Systemd integration
Edit `/etc/systemd/system/tensorflow.service`:
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service
[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s
[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```
Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```
## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```
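The same endpoint that atextcrawler uses (`model_server_endpoint` in `main.yaml`) can be called from Python; a minimal sketch using the `requests` library and the TensorFlow Serving REST predict API:
```
import requests

endpoint = 'http://localhost:9000/v1/models/sentences:predict'
payload = {'instances': ['This is a sentence.', 'Dies ist ein Satz.']}
resp = requests.post(endpoint, json=payload, timeout=10)
resp.raise_for_status()
embeddings = resp.json()['predictions']  # one embedding vector per input sentence
print(len(embeddings), len(embeddings[0]))
```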
## Docs
* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server
Datasets:
* https://www.tensorflow.org/datasets/catalog/overview