Put under version control
This commit is contained in:
parent
d26d23348b
commit
a6af5b12d2
83 changed files with 20130 additions and 0 deletions
23
doc/source/config_template/initial_data/seed_urls.list
Normal file
23
doc/source/config_template/initial_data/seed_urls.list
Normal file
|
@ -0,0 +1,23 @@
|
|||
# Initial URLs (first run only)
|
||||
#
|
||||
# To whitelist a URL prepend '+', to blacklist prepend '-'.
|
||||
# Comment lines must begin with '#'.
|
||||
|
||||
# de
|
||||
+http://agd.blogsport.de/
|
||||
+https://blackblogs.org/blogs/
|
||||
+https://fau.org/
|
||||
+http://anarchiv.de/
|
||||
+http://olaf.bbm.de/die-aktion
|
||||
-https://www.anarchistischefoderation.de/
|
||||
|
||||
# en
|
||||
+https://anarchistarchivist.com/
|
||||
+https://bookshelf.theanarchistlibrary.org/library/
|
||||
+https://archive.elephanteditions.net/library/
|
||||
+https://blackrosefed.org/
|
||||
+https://alpineanarchist.org/
|
||||
+https://nostate.net/
|
||||
+https://abolishing.blackblogs.org/
|
||||
+http://library.nothingness.org/
|
||||
-https://www.anarchistfederation.net/
|
88
doc/source/config_template/main.yaml
Normal file
88
doc/source/config_template/main.yaml
Normal file
|
@ -0,0 +1,88 @@
|
|||
# Name of this instance
|
||||
# Default value: atextcrawler
|
||||
# Allowed values: arbitrary string
|
||||
instance_name: atextcrawler
|
||||
|
||||
# Which kind of instance is this?
|
||||
# Default value: prod
|
||||
# Allowed values are:
|
||||
# - 'dev': development instance
|
||||
# - 'staging': staging instance
|
||||
# - 'prod': production instance
|
||||
instance_type: prod
|
||||
|
||||
# Log level
|
||||
# Default value: info
|
||||
# Allowed values: critical, error, warning, info, debug
|
||||
log_level: info
|
||||
|
||||
# Plugins directory
|
||||
# If given as relative path, it will be relative to the
|
||||
# directory of this file (main.yaml).
|
||||
# Read documentation on plugins.
|
||||
# Default value: plugins
|
||||
# Hint: Create an empty __init__.py in the plugins_dir.
|
||||
plugins_dir: plugins
|
||||
|
||||
# Parameters for access to the PostgreSQL service
|
||||
# No default values; must be set.
|
||||
postgresql:
|
||||
host: localhost
|
||||
port: 5432
|
||||
database: atextcrawler
|
||||
user: atextcrawler
|
||||
password: ________________________
|
||||
|
||||
# Crawling
|
||||
crawl:
|
||||
# Number of concurrent workers
|
||||
# Default value: 10
|
||||
# Allowed values: integer >=0 and <=1000
|
||||
#workers: 3
|
||||
|
||||
# Delay in seconds between attempts to fetch items
|
||||
# from site_queue if the last attempt gave no item
|
||||
# Also the delay in seconds after a worker has found
|
||||
# no site to process
|
||||
# Default value: 600
|
||||
# Allowed values: positive number
|
||||
#site_delay: 10
|
||||
|
||||
# Time interval in seconds between site updates when
|
||||
# handling queued base URLs
|
||||
# Default value: 3600
|
||||
# Allowed values: positive number
|
||||
#site_revisit_interval: 3600
|
||||
|
||||
# Delay in seconds between attempts to process
|
||||
# individual resources (pages etc.) of a site
|
||||
# Default value: 5
|
||||
# Allowed values: positive number
|
||||
#resource_delay: 3
|
||||
|
||||
# Default interval in seconds between full crawls of a site
|
||||
# Default value: 864000 (10 days)
|
||||
# Allowed values: positive number
|
||||
#full_crawl_interval: 864000
|
||||
|
||||
# Default interval in seconds between feed crawls of a site
|
||||
# Default value: 86400 (1 day)
|
||||
# Allowed values: positive number
|
||||
#feed_crawl_interval: 86400
|
||||
|
||||
# Parameters for access to the ElasticSearch service
|
||||
# No default values; must be set.
|
||||
elasticsearch:
|
||||
# host on which ES is running
|
||||
host: localhost
|
||||
# API key for accessing ES
|
||||
api_key: "**********************"
|
||||
# API user id
|
||||
id: "**********************"
|
||||
# Index base name (actual index names will have '_text' etc. appended)
|
||||
index_base_name: atext
|
||||
|
||||
# Tensorflow access
|
||||
tensorflow:
|
||||
# The prediction endpoint of the model server's sentence model
|
||||
model_server_endpoint: http://localhost:9000/v1/models/sentences:predict
|
0
doc/source/config_template/plugins/__init__.py
Normal file
0
doc/source/config_template/plugins/__init__.py
Normal file
22
doc/source/config_template/plugins/filter_resource_path.py
Normal file
22
doc/source/config_template/plugins/filter_resource_path.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
Filter paths found in a resource.
|
||||
|
||||
This plugin implements :func:`rp_filter`.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter a path found in a resource (may depend on *site*).

    Return the (possibly modified) path to be added to table
    `site_path`, or None to filter the path out (i.e., not add it).
    """
    path = durl.pwa()
    # Skip fetching images: these paths come from <a> tags
    # (<img> tags are skipped anyway).
    if path.lower().endswith(('.jpg', '.png')):
        return None
    # Strip the AMP query suffix so only the canonical page is stored.
    path = path.removesuffix('?amp=1')
    return path
|
47
doc/source/config_template/plugins/filter_site.py
Normal file
47
doc/source/config_template/plugins/filter_site.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
"""
|
||||
Relevance estimation of sites.
|
||||
|
||||
This plugin implements :func:`site_filter`.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from atextcrawler.models import Site
|
||||
|
||||
# Minimum weighted score a startpage must reach to be crawled.
MIN_RELEVANCE_SCORE = 5


async def site_filter(site: 'Site') -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    If the site shall be crawled, return True, else False.
    """
    # Limit to sites in English or German language.
    if not set(['de', 'en']) & set(site.langs):
        return False
    score = 0.0
    for crit_name, weight, langs, crit_re in re_criteria:
        # '*' means the criterion applies to all languages.
        if '*' in langs or set(langs) & set(site.langs):
            findings = crit_re.findall(site.startpage_text)
            if findings:
                score += weight * len(findings)
            # Matches in title or description weigh more than body text.
            if site.title and crit_re.search(site.title):
                score += 4 * weight
            if site.description and crit_re.search(site.description):
                score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE


# Relevance criteria: (name, weight, applicable langs, compiled regex).
re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    # BUGFIX: langs was ('de') — a plain string, not a 1-tuple — so
    # set(langs) evaluated to {'d', 'e'} and this criterion never
    # matched a site with langs == ['de'].  Also collapsed the
    # redundant regex alternative (är|är).
    ('libertär', 0.5, ('de',), re.compile('(libertär)', re.I)),
}
|
24
doc/source/config_template/plugins/filter_site_path.py
Normal file
24
doc/source/config_template/plugins/filter_site_path.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
"""
|
||||
Plugin for filtering paths of a site to be retrieved.
|
||||
|
||||
This plugin implements :func:`sp_filter`.
|
||||
"""
|
||||
|
||||
|
||||
def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.
    """
    # Honour the site's robots.txt rules.
    if not robots.can_fetch_url(site.base_url + path):
        return False
    # Amusewiki sites offer every text in several download formats;
    # only the canonical page (no format suffix) is worth fetching.
    if 'amusewiki' in site.meta_info.get('generator', '').lower():
        if path.endswith(('.html', '.epub', '.tex', '.zip', '.pdf')):
            return False
    # Bulletin-board select URLs span an unbounded URL space.
    if '/bbselect?' in path:
        return False
    return True
|
Loading…
Add table
Add a link
Reference in a new issue