Put under version control

commit a6af5b12d2 (parent d26d23348b)
83 changed files with 20130 additions and 0 deletions
doc/source/conf.py | 71 additions | Normal file
@@ -0,0 +1,71 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html


# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.'))))
import os
import sys

proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.')))
sys.path.insert(0, proj_dir + '/src')


# -- Project information -----------------------------------------------------

project = 'atextcrawler'
copyright = '2021, ibu radempa'
author = 'ibu radempa'

# The full version, including alpha/beta/rc tags
release = '0.1.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'myst_parser',
    'sphinx.ext.graphviz',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

autosummary_generate = True

source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}

doc/source/config_template/initial_data/seed_urls.list | 23 additions | Normal file
@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.

# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/

# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/

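Note: the whitelist/blacklist convention above ('+' and '-' prefixes, '#' comments) could be parsed with a small helper along these lines (a sketch only; `parse_seed_urls` is a hypothetical name, not part of atextcrawler's API):
```python
from typing import List, Tuple


def parse_seed_urls(text: str) -> Tuple[List[str], List[str]]:
    """Split a seed list into whitelisted and blacklisted URLs."""
    whitelist, blacklist = [], []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue  # skip blank lines and comments
        if line.startswith('+'):
            whitelist.append(line[1:])
        elif line.startswith('-'):
            blacklist.append(line[1:])
    return whitelist, blacklist
```
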
doc/source/config_template/main.yaml | 88 additions | Normal file
@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler

# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod

# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info

# Plugins directory
# If given as a relative path, it will be relative to the
# directory of this file (main.yaml).
# Read the documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins

# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________

# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3

  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item.
  # Also the delay in seconds after a worker has found
  # no site to process.
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10

  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600

  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3

  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000

  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400

# Parameters for access to the ElasticSearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext

# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict

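Note: a configuration like the template above could be loaded, with the documented crawl defaults filled in, roughly as follows (a sketch assuming PyYAML is available; atextcrawler's real loader is `atextcrawler.config`, and `load_main_config` here is a hypothetical name):
```python
import os

import yaml  # PyYAML

# Default values as documented in main.yaml above.
CRAWL_DEFAULTS = {
    'workers': 10,
    'site_delay': 600,
    'site_revisit_interval': 3600,
    'resource_delay': 5,
    'full_crawl_interval': 864000,
    'feed_crawl_interval': 86400,
}


def load_main_config(path: str = '~/.config/atextcrawler/main.yaml') -> dict:
    """Load main.yaml and fill in the documented crawl defaults."""
    with open(os.path.expanduser(path)) as f:
        config = yaml.safe_load(f) or {}
    crawl = config.get('crawl') or {}
    for key, default in CRAWL_DEFAULTS.items():
        crawl.setdefault(key, default)
    config['crawl'] = crawl
    return config
```
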
doc/source/config_template/plugins/__init__.py | 0 additions | Normal file

doc/source/config_template/plugins/filter_resource_path.py | 22 additions | Normal file
@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.

This plugin implements :func:`rp_filter`.
"""

from typing import Optional


def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter found paths (may depend on site).

    To filter out a path (i.e., not add it to table `site_path`)
    return None.
    """
    path = durl.pwa()
    # skip fetching images (linked from <a> tags; <img> tags are skipped anyway)
    if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
        return None
    path = path.removesuffix('?amp=1')
    return path

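Note: a quick illustration of the `rp_filter` contract, using a purely hypothetical stand-in for the `durl` argument that atextcrawler passes in (assumes `rp_filter` from the plugin above has been imported):
```python
class FakeDurl:
    """Hypothetical stand-in exposing only the pwa() method used above."""

    def __init__(self, path: str):
        self._path = path

    def pwa(self) -> str:
        return self._path


assert rp_filter(None, FakeDurl('/photos/cat.JPG')) is None  # images are skipped
assert rp_filter(None, FakeDurl('/texts/intro?amp=1')) == '/texts/intro'
```
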
doc/source/config_template/plugins/filter_site.py | 47 additions | Normal file
@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.

This plugin implements :func:`site_filter`.
"""

import re

from atextcrawler.models import Site

MIN_RELEVANCE_SCORE = 5


async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    If the site shall be crawled, return True, else False.
    """
    # limit to sites in English or German language
    if not set(['de', 'en']) & set(site.langs):
        return False
    score = 0.0
    for crit_name, weight, langs, crit_re in re_criteria:
        if '*' in langs or set(langs) & set(site.langs):
            findings = crit_re.findall(site.startpage_text)
            if findings:
                score += weight * len(findings)
            if site.title and crit_re.search(site.title):
                score += 4 * weight
            if site.description and crit_re.search(site.description):
                score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE


re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    ('libertär', 0.5, ('de',), re.compile('(libert(är|är))', re.I)),
}

doc/source/config_template/plugins/filter_site_path.py | 24 additions | Normal file
@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.

This plugin implements :func:`sp_filter`.
"""


def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    if 'amusewiki' in site.meta_info.get('generator', '').lower():
        if any(
            [
                path.endswith(end)
                for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
            ]
        ):
            return False
    if '/bbselect?' in path:
        return False
    return True

doc/source/devel/devel.md | 63 additions | Normal file
@@ -0,0 +1,63 @@
## Setup dev environment
1. You need Python 3.9 or later.
1. Have pipenv installed, e.g.: install pip3 (`apt install python3-pip`), then run `pip3 install --user pipenv`.
1. Clone the repo and set up a virtualenv:
```
cd YOUR_DEV_DIR
git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
cd atextcrawler
pipenv install -d
```

## Configure the instance
See [installation](installation.md).

## Run
```
python -m atextcrawler
```

## Logging
Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
```
journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
```

## Upgrading
Upgrade dev tools:
```
pre-commit autoupdate
```

## Test and clean manually
```
AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
mypy --ignore-missing-imports src/atextcrawler
isort src/atextcrawler
black -S -t py37 -l 79 src/atextcrawler
pybetter --exclude B004,B007,B008 src/atextcrawler
interrogate -i -I -m -v src/atextcrawler
```

## Release
There are no releases (currently).

## Useful commands

### Fetch a resource or a site manually
```
python -m atextcrawler.resource https://www.katesharpleylibrary.net/
python -m atextcrawler.site https://www.katesharpleylibrary.net/
```

### SQL
```
drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;

http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*

http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices

-- stats: sites, paths, resources
select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
```

doc/source/devel/related_work.md | 64 additions | Normal file
@@ -0,0 +1,64 @@
## Related work
* [collection of crawlers](https://github.com/adbar/awesome-crawler)
* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)

### crawlers
* [acrawler](https://acrawler.readthedocs.io/en/latest/)
* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
  * [repo](https://github.com/adbar/trafilatura)
  * [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
* [scrapy](https://docs.scrapy.org/en/latest/)
* [heritrix3](https://github.com/internetarchive/heritrix3/)
* [YaCy](https://yacy.net/)
* [searchmysite](https://searchmysite.net/)
* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
* [edge search engine](https://memex.marginalia.nu/projects/edge/about.gmi)

#### general
* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search)

### sitemap parsers
* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser)

### url handling
* [courlan](https://pypi.org/project/courlan/)

### language detection
* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language)
* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/)
* [guess_language](https://pypi.org/project/guess-language/)
* [cld3](https://github.com/google/cld3)

### text extraction
* [JusText](http://corpus.tools/wiki/Justext_changelog) ([demo](https://nlp.fi.muni.cz/projects/justext/))

### deduplication
* [PostgreSQL extension smlar](https://github.com/jirutka/smlar)
  * [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3)
* remove paragraphs with more than 50% word-7-tuples encountered previously (see the sketch below)

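A minimal sketch of the word-7-tuple idea mentioned in the last bullet (plain in-memory shingles, not the smlar-based variant; the helper name is hypothetical):
```python
from typing import Set, Tuple

seen_shingles: Set[Tuple[str, ...]] = set()


def is_mostly_duplicate(paragraph: str, threshold: float = 0.5) -> bool:
    """Return True if more than `threshold` of the paragraph's word
    7-tuples were already seen in previously processed paragraphs."""
    words = paragraph.split()
    shingles = [tuple(words[i:i + 7]) for i in range(len(words) - 6)]
    if not shingles:
        return False
    known = sum(1 for s in shingles if s in seen_shingles)
    seen_shingles.update(shingles)
    return known / len(shingles) > threshold
```
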
### Extract more meta tags
* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md
* https://support.shareaholic.com/hc/en-us/articles/115003085186

### Date parsing dependent on language
* https://en.wikipedia.org/wiki/Date_format_by_country
* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository
* https://pypi.org/project/dateparser/
* https://github.com/ovalhub/pyicu
* https://github.com/night-crawler/cldr-language-helpers
* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language

ICU
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse
* https://gist.github.com/dpk/8325992
* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html
* https://unicode-org.github.io/icu/userguide/
* https://unicode-org.github.io/icu-docs/#/icu4c/
* https://github.com/ovalhub/pyicu/blob/master/samples/break.py
* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras
* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview

doc/source/devel/todo.md | 77 additions | Normal file
@@ -0,0 +1,77 @@
## TODO

* parse html time tags

* site annotations:
  * categories
  * historical (no changes any more since n months)
  * news
  * local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip

* allow for tls in elasticsearch config

* replace dashes, dots and quotes (a sketch of applying such a table follows this block): https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py
```
    '–': '--',
    '—': '---',
    '…': '...',
    '“': '"',
    '”': '"',
    '„': '"',
    '″': '"',
    '‘': "'",
    '’': "'",
    '′': "'",
```

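A minimal sketch of applying such a replacement table (assuming the single-character keys shown above; `unsmarten` is a hypothetical name, unrelated to calibre's implementation):
```python
REPLACEMENTS = {
    '–': '--', '—': '---', '…': '...',
    '“': '"', '”': '"', '„': '"', '″': '"',
    '‘': "'", '’': "'", '′': "'",
}
# str.maketrans accepts single-character keys mapped to replacement strings
_TABLE = str.maketrans(REPLACEMENTS)


def unsmarten(text: str) -> str:
    """Replace typographic dashes, ellipses and quotes with ASCII."""
    return text.translate(_TABLE)


print(unsmarten('„Hallo“ – sagte er…'))  # -> "Hallo" -- sagte er...
```
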
* normalize quotation marks and punctuation in general
  * https://unicode-table.com/en/sets/quotation-marks/
  * https://github.com/avian2/unidecode/blob/master/unidecode/x020.py
  * https://www.fileformat.info/info/unicode/category/Po/list.htm
  * https://www.gaijin.at/en/infos/unicode-character-table-punctuation
  * ⁝

* cancel crawls that take too long

* search for "TODO" in code

* feedparser has support for JSON feeds since commit
  a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
  (as of 2020-10-26 in the "develop" branch, not part of a release);
  the version names are 'json1' and 'json11'

* allow site URLs with path, e.g.
  https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/

* add more languages

## Ideas
* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives

* [spacy-langdetect](https://pypi.org/project/spacy-langdetect/)
* [langid.py](https://github.com/saffsd/langid.py)

* [gain](https://github.com/gaojiuli/gain)
* [ruia](https://docs.python-ruia.org/)
* [demiurge](https://demiurge.readthedocs.io/)
* [cocrawler](https://github.com/cocrawler/cocrawler/)
* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)

doc/source/development.rst | 9 additions | Normal file
@@ -0,0 +1,9 @@
Development
-----------

.. toctree::
   :maxdepth: 2

   devel/devel
   devel/todo
   devel/related_work

doc/source/elasticsearch.md | 119 additions | Normal file
@@ -0,0 +1,119 @@
# Howto elasticsearch

## Prerequisites
On the host (virtualization host) we need:
```
# cat /etc/sysctl.d/virtual_memory.conf
vm.max_map_count=262144
# sysctl -p /etc/sysctl.d/virtual_memory.conf
```

If this cannot be done, change this file after installing or upgrading elasticsearch:
```
/usr/lib/sysctl.d/elasticsearch.conf
```

## Setup

### Install package
In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).

We do a manual install. If you configure the apt repo instead, also think about setting
`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.

```
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
dpkg -i elasticsearch-7.15.2-amd64.deb
systemctl daemon-reload
systemctl enable elasticsearch.service
systemctl start elasticsearch.service
```

First test:
```
http -j GET 127.0.0.1:9200/
```

### Storage

```
systemctl stop elasticsearch.service
mv /var/lib/elasticsearch/ /srv/
systemctl start elasticsearch.service
```

Edit /etc/elasticsearch/elasticsearch.yml
```
cluster.name: org.a-text.search
node.name: atext1
path.data: /srv/elasticsearch
path.logs: /var/log/elasticsearch
discovery.seed_hosts: ["atext1.multiname.org"]
xpack.security.enabled: true
xpack.security.authc.api_key.enabled: true
```

```
systemctl restart elasticsearch
```

The logfile now is at
```
/var/log/elasticsearch/org.a-text.search.log
```

### Setup passwords
Set up passwords:
```
# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
The passwords will be randomly generated and printed to the console.
Please confirm that you would like to continue [y/N]y
```

Copy the output to /etc/elasticsearch/passwords and
```
chmod 400 /etc/elasticsearch/passwords
```

Check login as user elastic:
```
http --auth elastic:************** -j GET http://127.0.0.1:9200/
```

### Memory limitation
To limit memory usage, create a systemd override and restart the service:
```
mkdir /etc/systemd/system/elasticsearch.service.d
cat >/etc/systemd/system/elasticsearch.service.d/override.conf <<EOF
[Service]
LimitMEMLOCK=8G
EOF

systemctl stop elasticsearch
systemctl daemon-reload
systemctl start elasticsearch
```

## Usage
Some useful requests:

### List indices
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/indices
```
### Health
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/health
```
### Node attributes
```
http --auth elastic:$PASS -j GET http://127.0.0.1:9200/_cat/nodeattrs
```
### Create API key
```
http --auth elastic:$PASS -j POST http://127.0.0.1:9200/_security/api_key name=anarchism role_descriptors:='{"anarchism": {"cluster": [], "index": [{"names": ["anarchism_*"], "privileges": ["all"]}]}}'
```

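Note: the `id` and `api_key` returned by the request above are what the `elasticsearch` section of `main.yaml` expects. Elasticsearch API-key authentication sends `ApiKey base64(id:api_key)` in the `Authorization` header; a sketch (the helper name and the use of `requests` are arbitrary choices, not atextcrawler code):
```python
import base64

import requests


def es_auth_header(api_id: str, api_key: str) -> dict:
    """Build the Authorization header for an Elasticsearch API key."""
    token = base64.b64encode(f'{api_id}:{api_key}'.encode()).decode()
    return {'Authorization': f'ApiKey {token}'}


# usage (values come from the output of the _security/api_key request)
resp = requests.get(
    'http://127.0.0.1:9200/_cat/indices',
    headers=es_auth_header('<id>', '<api_key>'),
)
print(resp.text)
```
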
doc/source/index.rst | 37 additions | Normal file
@@ -0,0 +1,37 @@
atextcrawler
============

atextcrawler is an asynchronous webcrawler indexing text
for literal and semantic search.

Its client-side counterpart is atextsearch_.

atextcrawler crawls and indexes selected websites.
It starts from a few seed sites and follows their external links.
Criteria defined in plugin code determine which linked sites (and
which of their resources) are (recursively) added to the pool.

atextcrawler is written in Python, runs a configurable number of
async workers concurrently (in one process), uses tensorflow for
embedding (paragraph-sized) text chunks in a (multi-)language model
and stores metadata in PostgreSQL and texts in elasticsearch.

.. _atextsearch: https://gitea.multiname.org/a-text/atextsearch

.. toctree::
   :maxdepth: 2
   :caption: Contents:

   introduction
   installation
   maintenance
   development
   reference/modules


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

doc/source/installation.md | 122 additions | Normal file
@@ -0,0 +1,122 @@
# Installation
Installation has only been tested on Debian bullseye (amd64).
The instructions below are for this system.
(Please adapt them to other environments.)

## System packages
```
apt install pandoc tidy python3-systemd protobuf-compiler libprotobuf-dev
```
The protobuf packages are required for the Python package gcld3 (see below).

## PostgreSQL database
We need access to a PostgreSQL database. Install PostgreSQL or provide connectivity to a PostgreSQL database over TCP/IP. Create a new database:
```
createdb -E UTF8 --lc-collate=C --lc-ctype=C -T template0 -O atextcrawler atextcrawler
```

## Elasticsearch
We need access to an elasticsearch instance (over TCP/IP).

Note: TLS is not yet supported, so install this service locally.

See [elasticsearch howto](elasticsearch.md).

## Tensorflow model server
We need access to a tensorflow model server (over TCP/IP).
It should serve `universal_sentence_encoder_multilingual`
or a similar language model.

Note: TLS is not yet supported, so install this service locally.

See [tensorflow howto](tensorflow_model_server.md).

## Setup virtualenv and install atextcrawler
```
apt install python3-pip
adduser --home /srv/atextcrawler --disabled-password --gecos "" atextcrawler
su - atextcrawler
cat >>.bashrc <<EOF
export PYTHONPATH=\$HOME/repo/src
EOF
pip3 install --user pipenv
cat >>.profile <<EOF
PYTHONPATH=\$HOME/repo/src
PATH=\$HOME/.local/bin:\$PATH
\$HOME/.local/bin/pipenv shell
EOF
exit
su - atextcrawler
git clone https://gitea.multiname.org/a-text/atextcrawler.git repo
cd repo
pipenv sync
pipenv install --site-packages  # for systemd
pre-commit install
```

Note: One of the dependencies, the Python package `tldextract`,
uses this directory for caching:
```
$HOME/.cache/python-tldextract/
```

## Configure atextcrawler
As user `atextcrawler` execute
```
mkdir $HOME/.config
cp -r $HOME/repo/doc/source/config_template $HOME/.config/atextcrawler
```

Edit `$HOME/.config/atextcrawler/main.yaml`.

If you want to override a plugin, copy it to the plugins directory
and edit it, e.g.
```
cp /srv/atextcrawler/repo/src/atextcrawler/plugin_defaults/filter_site.py $HOME/.config/atextcrawler/plugins
```

Optionally edit `$HOME/.config/atextcrawler/initial_data/seed_urls.list`.

Check (and print) the instance configuration:
```
python -m atextcrawler.config
```

## Test run
To see if it works, run `atextcrawler` from the command line:
```
python -m atextcrawler
```
You can stop it with `Ctrl-C`; stopping may take a few seconds or even minutes.

## Install systemd service
To make the service persistent, create a systemd unit file
`/etc/systemd/system/atextcrawler.service` with this content:
```
[Unit]
Description=atextcrawler web crawler
Documentation=https://gitea.multiname.org/a-text/atextcrawler
Requires=network.target
After=network-online.target

[Service]
Type=simple
User=atextcrawler
Group=atextcrawler
WorkingDirectory=/srv/atextcrawler/repo
Environment=PYTHONPATH=/srv/atextcrawler/repo/src
ExecStart=/srv/atextcrawler/.local/bin/pipenv run python -m atextcrawler
TimeoutStartSec=30
ExecStop=/bin/kill -INT $MAINPID
TimeoutStopSec=180
Restart=on-failure

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable atextcrawler
systemctl start atextcrawler
```

doc/source/introduction.md | 66 additions | Normal file
@@ -0,0 +1,66 @@
# Introduction

## What atextcrawler does
* Start from a seed (white+black-)list of website base URLs
* Loop over sites selected by applying criteria to the content
  of the site's start page
* Crawl the site, i.e. loop over resources of the site
* Extract plaintext content from the resource (html parsing is
  optimized for html5); discard non-text content, but handle feeds
  and sitemaps
* Extract internal and external links; external links contribute
  to the site list
* Keep track of the sites and resources in a PostgreSQL database
* Store plaintext content of resources in an Elasticsearch index
* Store vector embeddings of plaintexts also in Elasticsearch,
  using tensorflow model server with a multilingual language model

## Architecture
There is only one python process running concurrently.
We use asyncio where possible (almost everywhere).

1. There is a queue of websites, see database table `site_queue`.
   The queue is fed a) on first startup with seeds, b) manually
   and c) from crawls which find external links.
   When the queue is handled, new sites are stored to table `site`.
   New sites are always updated; existing sites only if their last
   update was more than `crawl.site_revisit_interval` seconds in the past.
   After the queue has been handled there is a delay
   (`crawl.site_delay` seconds) before repetition.
1. Updating a site means: the start page is fetched and
   criteria are applied to its content to determine whether
   the site is relevant. (It is assumed that (non-)relevance is
   obvious from the start page already.) If the site is relevant,
   more information is fetched (e.g. sitemaps).
1. There is a configurable number of crawler workers (config
   `crawl.workers`) which concurrently crawl sites, one at a time
   per worker. (During the crawl the site is marked as locked using
   crawl_active=true.) They pick a relevant site which has not been
   crawled for a certain time ("checkout"), crawl it, and finally
   mark it as crawled (crawl_active=false, "checkin") and schedule
   the next crawl; see the sketch after this list.
   Each crawl (with begin time, end time, number of found (new)
   resources) is stored in table `crawl`.
1. Crawls are either full crawls (all paths reachable through links
   from the start page are fetched) or feed crawls (only paths listed
   in a feed of the site are fetched). The respective (minimum)
   intervals in which these crawls happen are `full_crawl_interval`
   and `feed_crawl_interval`.
   Feed crawls can happen more frequently (e.g. daily).
1. When a path is fetched it can result in a MetaResource (feed or
   sitemap) or a TextResource (redirects are followed and irrelevant
   content is ignored). A TextResource obtained from a path can be
   very similar to a resource obtained from another path; in this case
   no new resource is created, but both paths are linked to the same
   resource (see tables `site_path` and `resource`).
1. If a MetaResource is fetched and it is a sitemap, its paths are
   added to table `site_path`. If it is a feed, the feed is stored in
   table `site_feed` and its paths are added to table `site_path`.
1. Links between sites are stored in table `site_link`.
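The worker loop from step 3 could look roughly like the following sketch (hypothetical helper names and in-memory stubs; the real implementation lives in the atextcrawler sources and talks to PostgreSQL):
```python
import asyncio
import random


# Hypothetical stand-ins for the real database helpers.
async def checkout_site():
    """Pick a relevant site due for crawling and set crawl_active=true."""
    return random.choice([None, 'https://example.org/'])


async def crawl_site(site):
    """Run a full or feed crawl of one site."""
    await asyncio.sleep(0.1)


async def checkin_site(site):
    """Set crawl_active=false and schedule the next crawl."""


async def crawler_worker(site_delay: float = 1.0):
    """One worker: repeatedly check out a site, crawl it, check it back in."""
    while True:
        site = await checkout_site()
        if site is None:
            await asyncio.sleep(site_delay)  # no site due; wait and retry
            continue
        try:
            await crawl_site(site)
        finally:
            await checkin_site(site)


# crawl.workers such workers run concurrently in one process, e.g.:
# await asyncio.gather(*(crawler_worker() for _ in range(3)))
```
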
## Site annotations
Database table `site_annotation` can have any number of annotations
for a base_url. While crawling, these annotations are considered:
blacklisting or whitelisting has precedence over function `site_filter`
(in plugin `filter_site`).

Annotations cannot be managed from within atextcrawler;
this requires another application, usually [`atextsearch`](https://TODO).

Each annotation requires a base_url of the annotated site and,
if a site with this base_url exists in the `site` table,
it should also be associated with the site's id (column `site_id`).

## Limitations
* atextcrawler is not optimized for speed; it is meant to be run as a
  background task on a server with limited resources
  (or even an SBC, like a Raspberry Pi, with attached storage)
* atextcrawler only indexes text, no other resources like images

doc/source/maintenance.md | 23 additions | Normal file
@@ -0,0 +1,23 @@
# Maintenance

## Upgrading
```
su - atextcrawler
pip3 install --user --upgrade pipenv
cd repo
git pull
pipenv sync
systemctl restart atextcrawler
```

## Update tldextract
From time to time run (in the Python virtualenv):
```
tldextract --update
```
or
```
systemctl stop atextcrawler
rm -r $HOME/.cache/python-tldextract
systemctl start atextcrawler
```

doc/source/tensorflow_model_server.md | 98 additions | Normal file
@@ -0,0 +1,98 @@
# Tensorflow model server

## Setup server
Prepare:
```
apt install gnupg2
```
Add repo:
```
echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
```
Install package:
```
apt update
apt install tensorflow-model-server
```

## Setup models
```
mkdir -p /srv/tensorflow/workdir
mkdir -p /srv/tensorflow/models
```
Choose models from [tfhub.dev](https://tfhub.dev/) and for each do:
```
# example: https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
mkdir -p /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
cd /srv/tensorflow/models/universal-sentence-encoder-multilingual/3
wget -O universal-sentence-encoder-multilingual_3.tar.gz "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3?tf-hub-format=compressed"
tar xvfz universal-sentence-encoder-multilingual_3.tar.gz
rm universal-sentence-encoder-multilingual_3.tar.gz
```

Check:
```
tensorflow_model_server --rest_api_port=9000 --model_base_path="/srv/tensorflow/models/universal-sentence-encoder-multilingual/" --model_name=sentences
```

Config file `/srv/tensorflow/config`:
```
model_config_list: {
  config: {
    name: "sentences",
    base_path: "/srv/tensorflow/models/universal-sentence-encoder-multilingual",
    model_platform: "tensorflow"
    model_version_policy: {latest{}},
  },
  config: {
    ... (next model)
  },
}
```

## Systemd integration
Edit /etc/systemd/system/tensorflow.service
```
[Unit]
Description=tensorflow model server
After=network.target auditd.service

[Service]
Type=simple
WorkingDirectory=/srv/tensorflow/workdir
ExecStart=/usr/bin/tensorflow_model_server --rest_api_port=9000 --model_config_file=/srv/tensorflow/config
KillMode=process
Restart=on-failure
RestartSec=30s

[Install]
WantedBy=multi-user.target
```
and
```
systemctl daemon-reload
systemctl enable tensorflow
systemctl start tensorflow
```

Check:
```
http -j GET http://localhost:9000/v1/models/sentences
```

## Usage
Show model details:
```
http -j GET http://localhost:9000/v1/models/sentences/metadata
```

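Note: the `model_server_endpoint` in `main.yaml` points at this model's `:predict` URL. A sketch of a request against the TensorFlow Serving REST API, assuming the model's default serving signature accepts a batch of raw strings:
```python
import json
import urllib.request

ENDPOINT = 'http://localhost:9000/v1/models/sentences:predict'


def embed(sentences):
    """Request embeddings for a batch of sentences from the model server."""
    payload = json.dumps({'instances': sentences}).encode()
    req = urllib.request.Request(
        ENDPOINT, data=payload, headers={'Content-Type': 'application/json'}
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())['predictions']


# each prediction is an embedding vector for the corresponding sentence
vectors = embed(['What is anarchism?', 'Was ist Anarchismus?'])
print(len(vectors), len(vectors[0]))
```
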
## Docs

* `/usr/bin/tensorflow_model_server --help`
* https://github.com/tensorflow/serving/
* [REST API](https://www.tensorflow.org/tfx/serving/api_rest)
* https://github.com/hey-car/tensorflow-model-server

Datasets:
* https://www.tensorflow.org/datasets/catalog/overview