Put under version control
This commit is contained in:
parent
d26d23348b
commit
a6af5b12d2
83 changed files with 20130 additions and 0 deletions
0
doc/source/config_template/plugins/__init__.py
Normal file
0
doc/source/config_template/plugins/__init__.py
Normal file
22
doc/source/config_template/plugins/filter_resource_path.py
Normal file
22
doc/source/config_template/plugins/filter_resource_path.py
Normal file
|
@ -0,0 +1,22 @@
|
|||
"""
|
||||
Filter paths found in a resource.
|
||||
|
||||
This plugin implements :func:`rp_filter`.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def rp_filter(site, durl) -> Optional[str]:
    """
    Adjust or filter found paths (may depend on site).

    The path is obtained from ``durl.pwa()``. Paths ending in ``.jpg`` or
    ``.png`` (compared case-insensitively) are filtered out; for all
    other paths a trailing ``?amp=1`` is stripped.

    :param site: the site the path was found on (not used by this
        template implementation)
    :param durl: object providing the found path via its ``pwa()`` method
    :return: the (possibly adjusted) path, or None to filter the path
        out (i.e., not add it to table `site_path`)
    """
    path = durl.pwa()
    # skip fetching images (linked from a tags; img tags are skipped anyway)
    if path.lower().endswith(('.jpg', '.png')):
        return None
    # removesuffix() leaves the path untouched when the marker is absent
    return path.removesuffix('?amp=1')
|
47
doc/source/config_template/plugins/filter_site.py
Normal file
47
doc/source/config_template/plugins/filter_site.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
"""
|
||||
Relevance estimation of sites.
|
||||
|
||||
This plugin implements :func:`site_filter`.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from atextcrawler.models import Site
|
||||
|
||||
# Minimum total score a site must reach in site_filter to be crawled.
MIN_RELEVANCE_SCORE = 5
|
||||
|
||||
|
||||
async def site_filter(site: Site) -> bool:
    """
    Assess relevance of the site (using language-dependent criteria).

    Sites offering neither German nor English content are rejected
    outright. Otherwise every criterion in `re_criteria` that applies to
    one of the site's languages (or to all languages, marked by '*')
    contributes to a score: its weight for each match in the start page
    text, plus four times its weight if it matches the title, and again
    four times its weight if it matches the description.

    If the site shall be crawled (i.e., the score reaches
    `MIN_RELEVANCE_SCORE`), return True, else False.
    """
    site_langs = set(site.langs)
    # limit to sites in English or German language
    if not site_langs & {'de', 'en'}:
        return False
    score = 0.0
    for _crit_name, weight, langs, crit_re in re_criteria:
        # skip criteria that are neither universal nor for a site language
        if '*' not in langs and not site_langs & set(langs):
            continue
        # NOTE(review): the nesting of the three checks below was
        # reconstructed (all of them directly under the language
        # condition) -- confirm against the upstream file.
        findings = crit_re.findall(site.startpage_text)
        if findings:
            score += weight * len(findings)
        # a hit in title or description counts like four hits in the text
        for text in (site.title, site.description):
            if text and crit_re.search(text):
                score += 4 * weight

    # TODO: add criteria for named entities (FdA-IFA, FAU, ...)

    return score >= MIN_RELEVANCE_SCORE
|
||||
|
||||
|
||||
# Relevance criteria evaluated by site_filter.
#
# Each entry is a tuple (name, weight, langs, regex):
#   name   -- label of the criterion (not used for scoring)
#   weight -- score contributed per match
#   langs  -- tuple of language codes the criterion applies to;
#             '*' applies it to sites in any language
#   regex  -- compiled, case-insensitive pattern
re_criteria = {
    (
        'anarch',
        1.0,
        ('*',),
        # the lookbehind rejects matches preceded by "p" (e.g. "panarchy")
        re.compile(r'((?<!p)anarch(ie|ism|ist|y|o|a))', re.I),
    ),
    (
        'libertär',
        0.5,
        # must be a 1-tuple: a bare ('de') is just the string 'de', whose
        # characters never intersect a site's language codes
        ('de',),
        # accept both the precomposed "ä" and "a" + combining diaeresis
        re.compile('(libert(\u00e4r|a\u0308r))', re.I),
    ),
}
|
24
doc/source/config_template/plugins/filter_site_path.py
Normal file
24
doc/source/config_template/plugins/filter_site_path.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
"""
|
||||
Plugin for filtering paths of a site to be retrieved.
|
||||
|
||||
This plugin implements :func:`sp_filter`.
|
||||
"""
|
||||
|
||||
|
||||
def sp_filter(site, path, robots) -> bool:
    """
    Per-site path filter. Return whether the path shall be retrieved.

    A path is rejected if `robots` does not allow fetching the URL built
    from ``site.base_url + path``, if the site's ``generator`` meta info
    mentions amusewiki and the path ends in one of the export formats
    listed below, or if the path contains ``/bbselect?``.

    :param site: site object providing ``base_url`` and ``meta_info``
    :param path: the path to be checked
    :param robots: object providing ``can_fetch_url(url)``
    :return: True if the path shall be retrieved, else False
    """
    if not robots.can_fetch_url(site.base_url + path):
        return False
    generator = site.meta_info.get('generator', '').lower()
    if 'amusewiki' in generator and path.endswith(
        ('.html', '.epub', '.tex', '.zip', '.pdf')
    ):
        return False
    # NOTE(review): the indentation of this check was lost in the
    # available source; it is applied to all sites here -- confirm
    # whether it was meant for amusewiki sites only.
    return '/bbselect?' not in path
|
Loading…
Add table
Add a link
Reference in a new issue