Put under version control

ibu 2021-11-29 09:16:31 +00:00
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

@@ -0,0 +1,23 @@
# Initial URLs (first run only)
#
# To whitelist a URL prepend '+', to blacklist prepend '-'.
# Comment lines must begin with '#'.
# de
+http://agd.blogsport.de/
+https://blackblogs.org/blogs/
+https://fau.org/
+http://anarchiv.de/
+http://olaf.bbm.de/die-aktion
-https://www.anarchistischefoderation.de/
# en
+https://anarchistarchivist.com/
+https://bookshelf.theanarchistlibrary.org/library/
+https://archive.elephanteditions.net/library/
+https://blackrosefed.org/
+https://alpineanarchist.org/
+https://nostate.net/
+https://abolishing.blackblogs.org/
+http://library.nothingness.org/
-https://www.anarchistfederation.net/
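The format above is simple enough to parse line by line. A minimal sketch of such a parser (the helper `parse_seed_urls` and its return shape are illustrative, not part of atextcrawler's API):

```python
def parse_seed_urls(text: str) -> tuple[list[str], list[str]]:
    """Split a seed file into (whitelisted, blacklisted) URL lists."""
    whitelist: list[str] = []
    blacklist: list[str] = []
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue  # blank line or comment
        if line.startswith('+'):
            whitelist.append(line[1:])
        elif line.startswith('-'):
            blacklist.append(line[1:])
    return whitelist, blacklist
```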

@@ -0,0 +1,88 @@
# Name of this instance
# Default value: atextcrawler
# Allowed values: arbitrary string
instance_name: atextcrawler
# Which kind of instance is this?
# Default value: prod
# Allowed values are:
# - 'dev': development instance
# - 'staging': staging instance
# - 'prod': production instance
instance_type: prod
# Log level
# Default value: info
# Allowed values: critical, error, warning, info, debug
log_level: info
# Plugins directory
# If given as a relative path, it will be relative to the
# directory of this file (main.yaml).
# Read the documentation on plugins.
# Default value: plugins
# Hint: Create an empty __init__.py in the plugins_dir.
plugins_dir: plugins
# Parameters for access to the PostgreSQL service
# No default values; must be set.
postgresql:
  host: localhost
  port: 5432
  database: atextcrawler
  user: atextcrawler
  password: ________________________
# Crawling
crawl:
  # Number of concurrent workers
  # Default value: 10
  # Allowed values: integer >=0 and <=1000
  #workers: 3
  # Delay in seconds between attempts to fetch items
  # from site_queue if the last attempt gave no item.
  # Also the delay in seconds after a worker has found
  # no site to process.
  # Default value: 600
  # Allowed values: positive number
  #site_delay: 10
  # Time interval in seconds between site updates when
  # handling queued base URLs
  # Default value: 3600
  # Allowed values: positive number
  #site_revisit_interval: 3600
  # Delay in seconds between attempts to process
  # individual resources (pages etc.) of a site
  # Default value: 5
  # Allowed values: positive number
  #resource_delay: 3
  # Default interval in seconds between full crawls of a site
  # Default value: 864000 (10 days)
  # Allowed values: positive number
  #full_crawl_interval: 864000
  # Default interval in seconds between feed crawls of a site
  # Default value: 86400 (1 day)
  # Allowed values: positive number
  #feed_crawl_interval: 86400
# Parameters for access to the Elasticsearch service
# No default values; must be set.
elasticsearch:
  # host on which ES is running
  host: localhost
  # API key for accessing ES
  api_key: "**********************"
  # API user id
  id: "**********************"
  # Index base name (actual index names will have '_text' etc. appended)
  index_base_name: atext
# Tensorflow access
tensorflow:
  # The prediction endpoint of the model server's sentence model
  model_server_endpoint: http://localhost:9000/v1/models/sentences:predict
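The `model_server_endpoint` is a TensorFlow Serving REST URL; that API accepts a JSON body with an `instances` list and returns a `predictions` list. A minimal async sketch, assuming the sentence model takes raw strings (the exact payload depends on the deployed model's signature, and `embed_sentences` is an illustrative name):

```python
import aiohttp


async def embed_sentences(sentences: list[str], endpoint: str) -> list:
    """POST sentences to a TensorFlow Serving predict endpoint."""
    async with aiohttp.ClientSession() as session:
        async with session.post(endpoint, json={'instances': sentences}) as resp:
            resp.raise_for_status()
            data = await resp.json()
    return data['predictions']
```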

@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.
This plugin implements :func:`rp_filter`.
"""
from typing import Optional
def rp_filter(site, durl) -> Optional[str]:
"""
Adjust or filter found paths (may depend on site).
To filter out a path (i.e., not add it to table `site_path`)
return None.
"""
path = durl.pwa()
# skip fetching images (linked from a tags; img tags are skipped anyway)
if path.lower().endswith('.jpg') or path.lower().endswith('.png'):
return None
path = path.removesuffix('?amp=1')
return path
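To see the filter in action, the `durl` argument can be stubbed; only its `pwa()` method is used above (the stub is hypothetical, and `str.removesuffix` requires Python 3.9+):

```python
from types import SimpleNamespace

durl = SimpleNamespace(pwa=lambda: '/2021/11/some-post/?amp=1')
print(rp_filter(None, durl))  # -> '/2021/11/some-post/'

durl = SimpleNamespace(pwa=lambda: '/media/photo.JPG')
print(rp_filter(None, durl))  # -> None (image links are dropped)
```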

@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.
This plugin implements :func:`site_filter`.
"""
import re
from atextcrawler.models import Site
MIN_RELEVANCE_SCORE = 5
async def site_filter(site: Site) -> bool:
"""
Assess relevance of the site (using language-dependent criteria).
If the site shall be crawled, return True, else False.
"""
# limit to sites in English or German language
if not set(['de', 'en']) & set(site.langs):
return False
score = 0.0
for crit_name, weight, langs, crit_re in re_criteria:
if '*' in langs or set(langs) & set(site.langs):
findings = crit_re.findall(site.startpage_text)
if findings:
score += weight * len(findings)
if site.title and crit_re.search(site.title):
score += 4 * weight
if site.description and crit_re.search(site.description):
score += 4 * weight
# TODO: add criteria for named entities (FdA-IFA, FAU, ...)
return score >= MIN_RELEVANCE_SCORE
re_criteria = {
(
'anarch',
1.0,
('*',),
re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
),
('libertär', 0.5, ('de'), re.compile('(libert(är|&auml;r))', re.I)),
}
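A worked example of the scoring: the negative lookbehind `(?<!p)` keeps words like 'panarchy' from matching. With the 'anarch' criterion's weight of 1.0, two matches in the start page text add 2.0, and a match in the title adds another 4.0, for 6.0 >= MIN_RELEVANCE_SCORE:

```python
import re

crit_re = re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I)
text = 'Anarchism and anarchist history, but not panarchy.'
print(crit_re.findall(text))
# -> [('Anarchism', 'ism'), ('anarchist', 'ist')]  (2 findings: score 2.0)
```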

@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
"""
Per-site path filter. Return whether the path shall be retrieved.
"""
if not robots.can_fetch_url(site.base_url + path):
return False
if 'amusewiki' in site.meta_info.get('generator', '').lower():
if any(
[
path.endswith(end)
for end in ('.html', '.epub', '.tex', '.zip', '.pdf')
]
):
return False
if '/bbselect?' in path:
return False
return True
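A usage sketch with hypothetical stubs (the real `site` and `robots` objects come from atextcrawler and carry more state): for a site generated by amusewiki, the alternative download formats are skipped while plain page paths pass:

```python
from types import SimpleNamespace

site = SimpleNamespace(
    base_url='https://example.org',
    meta_info={'generator': 'amusewiki 2.x'},
)
robots = SimpleNamespace(can_fetch_url=lambda url: True)  # permit everything

print(sp_filter(site, '/library/some-text.epub', robots))  # -> False
print(sp_filter(site, '/library/some-text', robots))       # -> True
```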