Make the minimum text length configurable and actually remove the Elasticsearch documents of resources with too short texts

This commit is contained in:
ibu 2021-12-26 18:21:15 +00:00
parent 08ca7fee56
commit f8debcff87
3 changed files with 17 additions and 3 deletions

View file

@ -70,6 +70,12 @@ crawl:
# Allowed values: positive number # Allowed values: positive number
#feed_crawl_interval: 86400 #feed_crawl_interval: 86400
# Minimum length (in characters) of the text extracted from
# a resource; resources with shorter texts are discarded, i.e.,
# they are not stored and are removed from the search index.
# Default value: 300
# Allowed values: positive number
#min_text_length: 300
# Parameters for access to the Elasticsearch service # Parameters for access to the Elasticsearch service
# No default values; must be set. # No default values; must be set.
elasticsearch: elasticsearch:

View file

@ -278,6 +278,7 @@ schema_crawl = Schema(
Required('resource_delay', default=5): positive_number, Required('resource_delay', default=5): positive_number,
Required('full_crawl_interval', default=864000): positive_number, Required('full_crawl_interval', default=864000): positive_number,
Required('feed_crawl_interval', default=86400): positive_number, Required('feed_crawl_interval', default=86400): positive_number,
Required('min_text_length', default=300): positive_number,
} }
) )

View file

@ -143,7 +143,7 @@ async def process_site_path(
""" """
Fetch a path, deduplicate and if canonical, update and index the resource. Fetch a path, deduplicate and if canonical, update and index the resource.
Return whether a new resource was handled that should contribute be Return whether a new resource was handled that should contribute to
statistics. statistics.
""" """
msg = ( msg = (
@ -241,10 +241,17 @@ async def _handle_text_resource(
# find resources similar to the current text # find resources similar to the current text
text = resource.search_fields['text'] text = resource.search_fields['text']
if len(text) < 300: # discard resources with too short texts
site_path.resource_id = None # discard resources with too short texts
if len(text) < app.config['crawl']['min_text_length']:
await site_path.unlink_resource(
conn,
app.search_engine,
app.config['elasticsearch']['index_base_name'],
)
await site_path.save(conn) await site_path.save(conn)
return False, False return False, False
simhash = simhash_from_bigint(resource.simhash) simhash = simhash_from_bigint(resource.simhash)
index = site.simhash_index index = site.simhash_index
similar_ids = search_simhash(index, simhash) similar_ids = search_simhash(index, simhash)