Fix many bugs related to resource storage and resource merging
This commit is contained in:
		
							parent
							
								
									48389a0769
								
							
						
					
					
						commit
						32066ad362
					
				
					 1 changed files with 52 additions and 55 deletions
				
			
		| 
						 | 
				
			
			@ -99,7 +99,7 @@ async def get_site_path(
 | 
			
		|||
    Return the next path of a given site that needs to be processed.
 | 
			
		||||
 | 
			
		||||
    If none needs to be processed, return None.
 | 
			
		||||
    I particular, for sites having crawl_enabled=false return None.
 | 
			
		||||
    In particular, for sites having crawl_enabled=false return None.
 | 
			
		||||
 | 
			
		||||
    Only return paths that have last been visited before *before*
 | 
			
		||||
    or not been processed at all. Paths with an ok_count of -3 or lower
 | 
			
		||||
| 
						 | 
				
			
			@ -200,34 +200,37 @@ async def process_site_path(
 | 
			
		|||
                        site.base_url, res_sitemap.urls
 | 
			
		||||
                    )
 | 
			
		||||
                    await add_site_paths(conn, site.id_, paths)
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    # handle TextResource
 | 
			
		||||
    relevant, is_new_resource = await _handle_text_resource(
 | 
			
		||||
        resource_id = None
 | 
			
		||||
        is_new_resource = False
 | 
			
		||||
    else:  # handle TextResource
 | 
			
		||||
        resource_id, is_new_resource = await _handle_text_resource(
 | 
			
		||||
            app, conn, tf, site, site_path, resource, url
 | 
			
		||||
        )
 | 
			
		||||
    if not relevant:
 | 
			
		||||
        return False
 | 
			
		||||
    site_path.resource_id = resource.id_
 | 
			
		||||
        site_path.canonical = resource.init_fields.get('canonical')
 | 
			
		||||
    site_path.ok_count += 1
 | 
			
		||||
    await site_path.save(conn)
 | 
			
		||||
 | 
			
		||||
        if shortlink_url := resource.init_fields.get('shortlink'):
 | 
			
		||||
            await _save_shortlink(
 | 
			
		||||
            conn, site, url, resource, shortlink_url, site_path.last_visit
 | 
			
		||||
                conn,
 | 
			
		||||
                site,
 | 
			
		||||
                url,
 | 
			
		||||
                resource_id,
 | 
			
		||||
                shortlink_url,
 | 
			
		||||
                site_path.last_visit,
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    site_path.resource_id = resource_id
 | 
			
		||||
    site_path.ok_count += 1
 | 
			
		||||
    await site_path.save(conn)
 | 
			
		||||
    return is_new_resource
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
async def _handle_text_resource(
 | 
			
		||||
    app, conn, tf, site, site_path, resource, url
 | 
			
		||||
) -> tuple[bool, bool]:
 | 
			
		||||
) -> tuple[Optional[int], bool]:
 | 
			
		||||
    """
 | 
			
		||||
    Ingest a text resource.
 | 
			
		||||
    Ingest a text resource returning the id of the possibly merged resource.
 | 
			
		||||
 | 
			
		||||
    Return whether the resource is relevant and whether it is new.
 | 
			
		||||
    Return the id of the merged resource (or None if the incoming resource
 | 
			
		||||
    has a too short text and is not worth storing a resource) and
 | 
			
		||||
    whether the resource is new (False if the returned resource_id is None).
 | 
			
		||||
    """
 | 
			
		||||
    # save the resource's internal links
 | 
			
		||||
    paths = []
 | 
			
		||||
| 
						 | 
				
			
			@ -250,22 +253,18 @@ async def _handle_text_resource(
 | 
			
		|||
            app.config['elasticsearch']['index_base_name'],
 | 
			
		||||
        )
 | 
			
		||||
        await site_path.save(conn)
 | 
			
		||||
        return False, False
 | 
			
		||||
        return None, False
 | 
			
		||||
 | 
			
		||||
    simhash = simhash_from_bigint(resource.simhash)
 | 
			
		||||
    index = site.simhash_index
 | 
			
		||||
    similar_ids = search_simhash(index, simhash)
 | 
			
		||||
    print(similar_ids, site_path.resource_id)
 | 
			
		||||
 | 
			
		||||
    # determine the destination resource and resources to be merged into it
 | 
			
		||||
    old_id = site_path.resource_id
 | 
			
		||||
    if (
 | 
			
		||||
        old_id
 | 
			
		||||
        and old_id in similar_ids
 | 
			
		||||
        and (  # similar to old text
 | 
			
		||||
            dest_resource := await TextResource().load(conn, old_id)
 | 
			
		||||
        )
 | 
			
		||||
    ):
 | 
			
		||||
        merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
 | 
			
		||||
    if old_id and old_id in similar_ids:
 | 
			
		||||
        merge_ids = similar_ids
 | 
			
		||||
        dest_resource = await TextResource().load(conn, old_id)
 | 
			
		||||
    else:  # no old text, or old text not similar any more
 | 
			
		||||
        if old_id:
 | 
			
		||||
            await site_path.unlink_resource(
 | 
			
		||||
| 
						 | 
				
			
			@ -302,7 +301,6 @@ async def _handle_text_resource(
 | 
			
		|||
        create_simhash(index, resource.id_, simhash)
 | 
			
		||||
 | 
			
		||||
    # add resource to search index
 | 
			
		||||
    if resource.content_type in ('html', 'plain'):
 | 
			
		||||
    await index_resource(
 | 
			
		||||
        app.search_engine,
 | 
			
		||||
        tf,
 | 
			
		||||
| 
						 | 
				
			
			@ -312,26 +310,25 @@ async def _handle_text_resource(
 | 
			
		|||
        url,
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    # merge resources: merge_ids -> resource
 | 
			
		||||
    for merge_id in merge_ids:
 | 
			
		||||
        # replace links to the merge resource with links to the dest resource
 | 
			
		||||
        sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
 | 
			
		||||
        await conn.execute(sql, resource.id_ or None, merge_id)
 | 
			
		||||
        # remove orphaned merge resource
 | 
			
		||||
        sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
 | 
			
		||||
        found = await conn.fetchval(sql, merge_id)
 | 
			
		||||
        if found:
 | 
			
		||||
    # replace references to any merge resource with links to the dest resource
 | 
			
		||||
    sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=ANY($2)"
 | 
			
		||||
    await conn.execute(sql, resource.id_, merge_ids)
 | 
			
		||||
 | 
			
		||||
    # remove orphaned resources after merging
 | 
			
		||||
    sql = "DELETE FROM resource WHERE id=ANY($1) RETURNING (id, lang)"
 | 
			
		||||
    rows = await conn.fetch(sql, set(merge_ids) - set([resource.id_]))
 | 
			
		||||
    for row in rows:
 | 
			
		||||
        await delete_resource(
 | 
			
		||||
            app.search_engine,
 | 
			
		||||
                found[1],
 | 
			
		||||
                merge_id,
 | 
			
		||||
            row['row'][1],
 | 
			
		||||
            row['row'][0],
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    return True, is_new_resource
 | 
			
		||||
    return resource.id_, is_new_resource
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
async def _save_shortlink(
 | 
			
		||||
    conn, site, url, resource, shortlink_url, last_visit
 | 
			
		||||
    conn, site, url, resource_id, shortlink_url, last_visit
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Save a shortlink.
 | 
			
		||||
| 
						 | 
				
			
			@ -349,11 +346,11 @@ async def _save_shortlink(
 | 
			
		|||
                last_visit=last_visit,
 | 
			
		||||
                ok_count=1,
 | 
			
		||||
                canonical=False,
 | 
			
		||||
                resource_id=resource.id_,
 | 
			
		||||
                resource_id=resource_id,
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            shortlink.last_visit = last_visit
 | 
			
		||||
            shortlink.ok_count += 1
 | 
			
		||||
            shortlink.canonical = False
 | 
			
		||||
            shortlink.resource_id = resource.id_
 | 
			
		||||
            shortlink.resource_id = resource_id
 | 
			
		||||
        await shortlink.save(conn)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue