Make minimum text length configurable and actually remove Elasticsearch documents
This commit is contained in:
parent
08ca7fee56
commit
f8debcff87
3 changed files with 17 additions and 3 deletions
|
@ -70,6 +70,12 @@ crawl:
|
||||||
# Allowed values: positive number
|
# Allowed values: positive number
|
||||||
#feed_crawl_interval: 86400
|
#feed_crawl_interval: 86400
|
||||||
|
|
||||||
|
# Minimum length of the text (in characters) extracted from
|
||||||
|
# a resource; resources with shorter texts are not stored.
|
||||||
|
# Default value: 300
|
||||||
|
# Allowed values: positive number
|
||||||
|
#min_text_length: 300
|
||||||
|
|
||||||
# Parameters for access to the Elasticsearch service
|
# Parameters for access to the Elasticsearch service
|
||||||
# No default values; must be set.
|
# No default values; must be set.
|
||||||
elasticsearch:
|
elasticsearch:
|
||||||
|
|
|
@ -278,6 +278,7 @@ schema_crawl = Schema(
|
||||||
Required('resource_delay', default=5): positive_number,
|
Required('resource_delay', default=5): positive_number,
|
||||||
Required('full_crawl_interval', default=864000): positive_number,
|
Required('full_crawl_interval', default=864000): positive_number,
|
||||||
Required('feed_crawl_interval', default=86400): positive_number,
|
Required('feed_crawl_interval', default=86400): positive_number,
|
||||||
|
Required('min_text_length', default=300): positive_number,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -143,7 +143,7 @@ async def process_site_path(
|
||||||
"""
|
"""
|
||||||
Fetch a path, deduplicate and if canonical, update and index the resource.
|
Fetch a path, deduplicate and if canonical, update and index the resource.
|
||||||
|
|
||||||
Return whether a new resource was handled that should contribute be
|
Return whether a new resource was handled that should contribute to
|
||||||
statistics.
|
statistics.
|
||||||
"""
|
"""
|
||||||
msg = (
|
msg = (
|
||||||
|
@ -241,10 +241,17 @@ async def _handle_text_resource(
|
||||||
|
|
||||||
# find resources similar to the current text
|
# find resources similar to the current text
|
||||||
text = resource.search_fields['text']
|
text = resource.search_fields['text']
|
||||||
if len(text) < 300: # discard resources with too short texts
|
|
||||||
site_path.resource_id = None
|
# discard resources with too short texts
|
||||||
|
if len(text) < app.config['crawl']['min_text_length']:
|
||||||
|
await site_path.unlink_resource(
|
||||||
|
conn,
|
||||||
|
app.search_engine,
|
||||||
|
app.config['elasticsearch']['index_base_name'],
|
||||||
|
)
|
||||||
await site_path.save(conn)
|
await site_path.save(conn)
|
||||||
return False, False
|
return False, False
|
||||||
|
|
||||||
simhash = simhash_from_bigint(resource.simhash)
|
simhash = simhash_from_bigint(resource.simhash)
|
||||||
index = site.simhash_index
|
index = site.simhash_index
|
||||||
similar_ids = search_simhash(index, simhash)
|
similar_ids = search_simhash(index, simhash)
|
||||||
|
|
Loading…
Reference in a new issue