Fix many bugs related to resource storage and resource merging

This commit is contained in:
ibu 2022-01-09 16:47:02 +00:00
parent 48389a0769
commit 32066ad362

View file

@ -99,7 +99,7 @@ async def get_site_path(
Return the next path of a given site that needs to be processed.
If none needs to be processed, return None.
I particular, for sites having crawl_enabled=false return None.
In particular, for sites having crawl_enabled=false return None.
Only return paths that have last been visited before *before*
or not been processed at all. Paths with an ok_count of -3 or lower
@ -200,34 +200,37 @@ async def process_site_path(
site.base_url, res_sitemap.urls
)
await add_site_paths(conn, site.id_, paths)
return False
# handle TextResource
relevant, is_new_resource = await _handle_text_resource(
resource_id = None
is_new_resource = False
else: # handle TextResource
resource_id, is_new_resource = await _handle_text_resource(
app, conn, tf, site, site_path, resource, url
)
if not relevant:
return False
site_path.resource_id = resource.id_
site_path.canonical = resource.init_fields.get('canonical')
site_path.ok_count += 1
await site_path.save(conn)
if shortlink_url := resource.init_fields.get('shortlink'):
await _save_shortlink(
conn, site, url, resource, shortlink_url, site_path.last_visit
conn,
site,
url,
resource_id,
shortlink_url,
site_path.last_visit,
)
site_path.resource_id = resource_id
site_path.ok_count += 1
await site_path.save(conn)
return is_new_resource
async def _handle_text_resource(
app, conn, tf, site, site_path, resource, url
) -> tuple[bool, bool]:
) -> tuple[Optional[int], bool]:
"""
Ingest a text resource.
Ingest a text resource, returning the id of the possibly merged resource.
Return whether the resource is relevant and whether it is new.
Return the id of the merged resource (or None if the incoming resource's
text is too short to be worth storing) and whether the resource is new
(always False when the returned resource_id is None).
"""
# save the resource's internal links
paths = []
@ -250,22 +253,18 @@ async def _handle_text_resource(
app.config['elasticsearch']['index_base_name'],
)
await site_path.save(conn)
return False, False
return None, False
simhash = simhash_from_bigint(resource.simhash)
index = site.simhash_index
similar_ids = search_simhash(index, simhash)
print(similar_ids, site_path.resource_id)
# determine the destination resource and resources to be merged into it
old_id = site_path.resource_id
if (
old_id
and old_id in similar_ids
and ( # similar to old text
dest_resource := await TextResource().load(conn, old_id)
)
):
merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
if old_id and old_id in similar_ids:
merge_ids = similar_ids
dest_resource = await TextResource().load(conn, old_id)
else: # no old text, or old text not similar any more
if old_id:
await site_path.unlink_resource(
@ -302,7 +301,6 @@ async def _handle_text_resource(
create_simhash(index, resource.id_, simhash)
# add resource to search index
if resource.content_type in ('html', 'plain'):
await index_resource(
app.search_engine,
tf,
@ -312,26 +310,25 @@ async def _handle_text_resource(
url,
)
# merge resources: merge_ids -> resource
for merge_id in merge_ids:
# replace links to the merge resource with links to the dest resource
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
await conn.execute(sql, resource.id_ or None, merge_id)
# remove orphaned merge resource
sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
found = await conn.fetchval(sql, merge_id)
if found:
# replace references to any merge resource with links to the dest resource
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=ANY($2)"
await conn.execute(sql, resource.id_, merge_ids)
# remove orphaned resources after merging
sql = "DELETE FROM resource WHERE id=ANY($1) RETURNING (id, lang)"
rows = await conn.fetch(sql, set(merge_ids) - set([resource.id_]))
for row in rows:
await delete_resource(
app.search_engine,
found[1],
merge_id,
row['row'][1],
row['row'][0],
)
return True, is_new_resource
return resource.id_, is_new_resource
async def _save_shortlink(
conn, site, url, resource, shortlink_url, last_visit
conn, site, url, resource_id, shortlink_url, last_visit
):
"""
Save a shortlink.
@ -349,11 +346,11 @@ async def _save_shortlink(
last_visit=last_visit,
ok_count=1,
canonical=False,
resource_id=resource.id_,
resource_id=resource_id,
)
else:
shortlink.last_visit = last_visit
shortlink.ok_count += 1
shortlink.canonical = False
shortlink.resource_id = resource.id_
shortlink.resource_id = resource_id
await shortlink.save(conn)