Fix many bugs related to resource storage and resource merging
This commit is contained in:
		
							parent
							
								
									48389a0769
								
							
						
					
					
						commit
						32066ad362
					
				
					 1 changed files with 52 additions and 55 deletions
				
			
		| 
						 | 
					@ -99,7 +99,7 @@ async def get_site_path(
 | 
				
			||||||
    Return the next path of a given site that needs to be processed.
 | 
					    Return the next path of a given site that needs to be processed.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    If none needs to be processed, return None.
 | 
					    If none needs to be processed, return None.
 | 
				
			||||||
    I particular, for sites having crawl_enabled=false return None.
 | 
					    In particular, for sites having crawl_enabled=false return None.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Only return paths that have last been visited before *before*
 | 
					    Only return paths that have last been visited before *before*
 | 
				
			||||||
    or not been processed at all. Paths with an ok_count of -3 or lower
 | 
					    or not been processed at all. Paths with an ok_count of -3 or lower
 | 
				
			||||||
| 
						 | 
					@ -200,34 +200,37 @@ async def process_site_path(
 | 
				
			||||||
                        site.base_url, res_sitemap.urls
 | 
					                        site.base_url, res_sitemap.urls
 | 
				
			||||||
                    )
 | 
					                    )
 | 
				
			||||||
                    await add_site_paths(conn, site.id_, paths)
 | 
					                    await add_site_paths(conn, site.id_, paths)
 | 
				
			||||||
        return False
 | 
					        resource_id = None
 | 
				
			||||||
 | 
					        is_new_resource = False
 | 
				
			||||||
    # handle TextResource
 | 
					    else:  # handle TextResource
 | 
				
			||||||
    relevant, is_new_resource = await _handle_text_resource(
 | 
					        resource_id, is_new_resource = await _handle_text_resource(
 | 
				
			||||||
        app, conn, tf, site, site_path, resource, url
 | 
					            app, conn, tf, site, site_path, resource, url
 | 
				
			||||||
    )
 | 
					        )
 | 
				
			||||||
    if not relevant:
 | 
					        site_path.canonical = resource.init_fields.get('canonical')
 | 
				
			||||||
        return False
 | 
					        if shortlink_url := resource.init_fields.get('shortlink'):
 | 
				
			||||||
    site_path.resource_id = resource.id_
 | 
					            await _save_shortlink(
 | 
				
			||||||
    site_path.canonical = resource.init_fields.get('canonical')
 | 
					                conn,
 | 
				
			||||||
 | 
					                site,
 | 
				
			||||||
 | 
					                url,
 | 
				
			||||||
 | 
					                resource_id,
 | 
				
			||||||
 | 
					                shortlink_url,
 | 
				
			||||||
 | 
					                site_path.last_visit,
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					    site_path.resource_id = resource_id
 | 
				
			||||||
    site_path.ok_count += 1
 | 
					    site_path.ok_count += 1
 | 
				
			||||||
    await site_path.save(conn)
 | 
					    await site_path.save(conn)
 | 
				
			||||||
 | 
					 | 
				
			||||||
    if shortlink_url := resource.init_fields.get('shortlink'):
 | 
					 | 
				
			||||||
        await _save_shortlink(
 | 
					 | 
				
			||||||
            conn, site, url, resource, shortlink_url, site_path.last_visit
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return is_new_resource
 | 
					    return is_new_resource
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
async def _handle_text_resource(
 | 
					async def _handle_text_resource(
 | 
				
			||||||
    app, conn, tf, site, site_path, resource, url
 | 
					    app, conn, tf, site, site_path, resource, url
 | 
				
			||||||
) -> tuple[bool, bool]:
 | 
					) -> tuple[Optional[int], bool]:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Ingest a text resource.
 | 
					    Ingest a text resource returning the id of the possibly merged resource.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Return whether the resource is relevant and whether it is new.
 | 
					    Return the id of the merged resource (or None if the incoming resource
 | 
				
			||||||
 | 
					    has a too short text and is not worth storing a resource) and
 | 
				
			||||||
 | 
					    whether the resource is new (False if the returned resource_id is None).
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    # save the resource's internal links
 | 
					    # save the resource's internal links
 | 
				
			||||||
    paths = []
 | 
					    paths = []
 | 
				
			||||||
| 
						 | 
					@ -250,22 +253,18 @@ async def _handle_text_resource(
 | 
				
			||||||
            app.config['elasticsearch']['index_base_name'],
 | 
					            app.config['elasticsearch']['index_base_name'],
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        await site_path.save(conn)
 | 
					        await site_path.save(conn)
 | 
				
			||||||
        return False, False
 | 
					        return None, False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    simhash = simhash_from_bigint(resource.simhash)
 | 
					    simhash = simhash_from_bigint(resource.simhash)
 | 
				
			||||||
    index = site.simhash_index
 | 
					    index = site.simhash_index
 | 
				
			||||||
    similar_ids = search_simhash(index, simhash)
 | 
					    similar_ids = search_simhash(index, simhash)
 | 
				
			||||||
 | 
					    print(similar_ids, site_path.resource_id)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # determine the destination resource and resources to be merged into it
 | 
					    # determine the destination resource and resources to be merged into it
 | 
				
			||||||
    old_id = site_path.resource_id
 | 
					    old_id = site_path.resource_id
 | 
				
			||||||
    if (
 | 
					    if old_id and old_id in similar_ids:
 | 
				
			||||||
        old_id
 | 
					        merge_ids = similar_ids
 | 
				
			||||||
        and old_id in similar_ids
 | 
					        dest_resource = await TextResource().load(conn, old_id)
 | 
				
			||||||
        and (  # similar to old text
 | 
					 | 
				
			||||||
            dest_resource := await TextResource().load(conn, old_id)
 | 
					 | 
				
			||||||
        )
 | 
					 | 
				
			||||||
    ):
 | 
					 | 
				
			||||||
        merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
 | 
					 | 
				
			||||||
    else:  # no old text, or old text not similar any more
 | 
					    else:  # no old text, or old text not similar any more
 | 
				
			||||||
        if old_id:
 | 
					        if old_id:
 | 
				
			||||||
            await site_path.unlink_resource(
 | 
					            await site_path.unlink_resource(
 | 
				
			||||||
| 
						 | 
					@ -302,36 +301,34 @@ async def _handle_text_resource(
 | 
				
			||||||
        create_simhash(index, resource.id_, simhash)
 | 
					        create_simhash(index, resource.id_, simhash)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # add resource to search index
 | 
					    # add resource to search index
 | 
				
			||||||
    if resource.content_type in ('html', 'plain'):
 | 
					    await index_resource(
 | 
				
			||||||
        await index_resource(
 | 
					        app.search_engine,
 | 
				
			||||||
 | 
					        tf,
 | 
				
			||||||
 | 
					        site_path,
 | 
				
			||||||
 | 
					        resource,
 | 
				
			||||||
 | 
					        site.base_url,
 | 
				
			||||||
 | 
					        url,
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # replace references to any merge resource with links to the dest resource
 | 
				
			||||||
 | 
					    sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=ANY($2)"
 | 
				
			||||||
 | 
					    await conn.execute(sql, resource.id_, merge_ids)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # remove orphaned resources after merging
 | 
				
			||||||
 | 
					    sql = "DELETE FROM resource WHERE id=ANY($1) RETURNING (id, lang)"
 | 
				
			||||||
 | 
					    rows = await conn.fetch(sql, set(merge_ids) - set([resource.id_]))
 | 
				
			||||||
 | 
					    for row in rows:
 | 
				
			||||||
 | 
					        await delete_resource(
 | 
				
			||||||
            app.search_engine,
 | 
					            app.search_engine,
 | 
				
			||||||
            tf,
 | 
					            row['row'][1],
 | 
				
			||||||
            site_path,
 | 
					            row['row'][0],
 | 
				
			||||||
            resource,
 | 
					 | 
				
			||||||
            site.base_url,
 | 
					 | 
				
			||||||
            url,
 | 
					 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # merge resources: merge_ids -> resource
 | 
					    return resource.id_, is_new_resource
 | 
				
			||||||
    for merge_id in merge_ids:
 | 
					 | 
				
			||||||
        # replace links to the merge resource with links to the dest resource
 | 
					 | 
				
			||||||
        sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
 | 
					 | 
				
			||||||
        await conn.execute(sql, resource.id_ or None, merge_id)
 | 
					 | 
				
			||||||
        # remove orphaned merge resource
 | 
					 | 
				
			||||||
        sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
 | 
					 | 
				
			||||||
        found = await conn.fetchval(sql, merge_id)
 | 
					 | 
				
			||||||
        if found:
 | 
					 | 
				
			||||||
            await delete_resource(
 | 
					 | 
				
			||||||
                app.search_engine,
 | 
					 | 
				
			||||||
                found[1],
 | 
					 | 
				
			||||||
                merge_id,
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return True, is_new_resource
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
async def _save_shortlink(
 | 
					async def _save_shortlink(
 | 
				
			||||||
    conn, site, url, resource, shortlink_url, last_visit
 | 
					    conn, site, url, resource_id, shortlink_url, last_visit
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Save a shortlink.
 | 
					    Save a shortlink.
 | 
				
			||||||
| 
						 | 
					@ -349,11 +346,11 @@ async def _save_shortlink(
 | 
				
			||||||
                last_visit=last_visit,
 | 
					                last_visit=last_visit,
 | 
				
			||||||
                ok_count=1,
 | 
					                ok_count=1,
 | 
				
			||||||
                canonical=False,
 | 
					                canonical=False,
 | 
				
			||||||
                resource_id=resource.id_,
 | 
					                resource_id=resource_id,
 | 
				
			||||||
            )
 | 
					            )
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            shortlink.last_visit = last_visit
 | 
					            shortlink.last_visit = last_visit
 | 
				
			||||||
            shortlink.ok_count += 1
 | 
					            shortlink.ok_count += 1
 | 
				
			||||||
            shortlink.canonical = False
 | 
					            shortlink.canonical = False
 | 
				
			||||||
            shortlink.resource_id = resource.id_
 | 
					            shortlink.resource_id = resource_id
 | 
				
			||||||
        await shortlink.save(conn)
 | 
					        await shortlink.save(conn)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue