Make minimum text length configurable and actually remove Elasticsearch documents of resources whose text is too short

2021-12-26 18:21:15 +00:00 · 2021-12-26 18:21:15 +00:00 · f8debcff87
commit f8debcff87
parent 08ca7fee56
3 changed files with 17 additions and 3 deletions
--- a/doc/source/config_template/main.yaml
+++ b/doc/source/config_template/main.yaml
@ -70,6 +70,12 @@ crawl:
    # Allowed values: positive number
    #feed_crawl_interval: 86400
    # Minimum length (in characters) of the text extracted from a
    # resource; resources whose text is shorter are discarded, i.e.,
    # they are neither stored nor kept in the search index.
    # Default value: 300
    # Allowed values: positive number
    #min_text_length: 300
 # Parameters for access to the ElasticSearch service
 # No default values; must be set.
 elasticsearch:
--- a/src/atextcrawler/config.py
+++ b/src/atextcrawler/config.py
@ -278,6 +278,7 @@ schema_crawl = Schema(
        Required('resource_delay', default=5): positive_number,
        Required('full_crawl_interval', default=864000): positive_number,
        Required('feed_crawl_interval', default=86400): positive_number,
        Required('min_text_length', default=300): positive_number,
    }
 )
--- a/src/atextcrawler/resource/operations.py
+++ b/src/atextcrawler/resource/operations.py
@ -143,7 +143,7 @@ async def process_site_path(
    """
    Fetch a path, deduplicate and, if canonical, update and index the resource.
-    Return whether a new resource was handled that should contribute be
+    Return whether a new resource was handled that should contribute to
    statistics.
    """
    msg = (
@ -241,10 +241,17 @@ async def _handle_text_resource(
    # find resources similar to the current text
    text = resource.search_fields['text']
-    if len(text) < 300:  # discard resources with too short texts
+
-        site_path.resource_id = None
+    # discard resources whose text is shorter than the configured minimum
    if len(text) < app.config['crawl']['min_text_length']:
        await site_path.unlink_resource(
            conn,
            app.search_engine,
            app.config['elasticsearch']['index_base_name'],
        )
        await site_path.save(conn)
        return False, False
    simhash = simhash_from_bigint(resource.simhash)
    index = site.simhash_index
    similar_ids = search_simhash(index, simhash)