Put under version control

This commit is contained in:
ibu 2021-11-29 09:16:31 +00:00
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

View file

@@ -0,0 +1,22 @@
"""
Filter paths found in a resource.
This plugin implements :func:`rp_filter`.
"""
from typing import Optional
def rp_filter(site, durl) -> Optional[str]:
    """
    Decide whether a path found in a resource is kept, and normalize it.

    The path is obtained from ``durl.pwa()``. `site` is accepted so that
    the decision may depend on the site; it is not consulted here.

    Return the path to be added to table `site_path` (with a trailing
    ``?amp=1`` removed), or None if the path shall be filtered out.
    """
    found = durl.pwa()
    # Images linked from a tags are not fetched (img tags are skipped
    # anyway); the extension check is case-insensitive.
    is_image = found.lower().endswith(('.jpg', '.png'))
    return None if is_image else found.removesuffix('?amp=1')

View file

@@ -0,0 +1,47 @@
"""
Relevance estimation of sites.
This plugin implements :func:`site_filter`.
"""
import re
from atextcrawler.models import Site
# Minimum score a site must reach in site_filter() to be crawled.
MIN_RELEVANCE_SCORE = 5


async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    Sites having neither English nor German among their languages are
    rejected. For every criterion in `re_criteria` that applies to one
    of the site's languages (or to all languages, marked by '*'), the
    score grows by ``weight`` per match in the startpage text and by
    ``4 * weight`` each for a match in the title and in the description.

    If the site shall be crawled (i.e., the score reaches
    `MIN_RELEVANCE_SCORE`), return True, else False.
    """
    site_langs = set(site.langs)
    # limit to sites in English or German language
    if not site_langs & {'de', 'en'}:
        return False
    score = 0.0
    for _crit_name, weight, langs, crit_re in re_criteria:
        if not ('*' in langs or set(langs) & site_langs):
            continue
        # Guard the startpage text like title and description below:
        # findall() raises a TypeError when it is not a string.
        if site.startpage_text:
            score += weight * len(crit_re.findall(site.startpage_text))
        if site.title and crit_re.search(site.title):
            score += 4 * weight
        if site.description and crit_re.search(site.description):
            score += 4 * weight
    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)
    return score >= MIN_RELEVANCE_SCORE


# Relevance criteria used by :func:`site_filter`. Each entry is a tuple
# (name, weight, languages, compiled regex); a '*' among the languages
# makes the criterion apply to sites of any language.
re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        re.compile('((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    # The languages must be a real tuple: ('de') is just the string 'de',
    # and set('de') == {'d', 'e'} never intersects with language codes,
    # which would silently disable this criterion.
    ('libertär', 0.5, ('de',), re.compile('(libert(är|&auml;r))', re.I)),
}

View file

@@ -0,0 +1,24 @@
"""
Plugin for filtering paths of a site to be retrieved.
This plugin implements :func:`sp_filter`.
"""
def sp_filter(site, path, robots) -> bool:
    """
    Tell whether `path` of `site` shall be retrieved.

    A path is rejected if `robots` disallows fetching its URL
    (``site.base_url + path``), if the site's 'generator' meta info
    mentions amusewiki and the path ends in one of the listed file
    extensions, or if the path contains ``/bbselect?``.
    """
    allowed = robots.can_fetch_url(site.base_url + path)
    if not allowed:
        return False
    generator = site.meta_info.get('generator', '').lower()
    if 'amusewiki' in generator and path.endswith(
        ('.html', '.epub', '.tex', '.zip', '.pdf')
    ):
        return False
    # NOTE(review): this check is assumed to apply to all sites, not only
    # to amusewiki ones -- confirm against the intended nesting.
    return '/bbselect?' not in path