Fix many bugs related to resource storage and resource merging
This commit is contained in:
parent
48389a0769
commit
32066ad362
1 changed files with 52 additions and 55 deletions
|
@ -99,7 +99,7 @@ async def get_site_path(
|
||||||
Return the next path of a given site that needs to be processed.
|
Return the next path of a given site that needs to be processed.
|
||||||
|
|
||||||
If none needs to be processed, return None.
|
If none needs to be processed, return None.
|
||||||
I particular, for sites having crawl_enabled=false return None.
|
In particular, for sites having crawl_enabled=false return None.
|
||||||
|
|
||||||
Only return paths that have last been visited before *before*
|
Only return paths that have last been visited before *before*
|
||||||
or not been processed at all. Paths with an ok_count of -3 or lower
|
or not been processed at all. Paths with an ok_count of -3 or lower
|
||||||
|
@ -200,34 +200,37 @@ async def process_site_path(
|
||||||
site.base_url, res_sitemap.urls
|
site.base_url, res_sitemap.urls
|
||||||
)
|
)
|
||||||
await add_site_paths(conn, site.id_, paths)
|
await add_site_paths(conn, site.id_, paths)
|
||||||
return False
|
resource_id = None
|
||||||
|
is_new_resource = False
|
||||||
# handle TextResource
|
else: # handle TextResource
|
||||||
relevant, is_new_resource = await _handle_text_resource(
|
resource_id, is_new_resource = await _handle_text_resource(
|
||||||
app, conn, tf, site, site_path, resource, url
|
app, conn, tf, site, site_path, resource, url
|
||||||
)
|
)
|
||||||
if not relevant:
|
|
||||||
return False
|
|
||||||
site_path.resource_id = resource.id_
|
|
||||||
site_path.canonical = resource.init_fields.get('canonical')
|
site_path.canonical = resource.init_fields.get('canonical')
|
||||||
site_path.ok_count += 1
|
|
||||||
await site_path.save(conn)
|
|
||||||
|
|
||||||
if shortlink_url := resource.init_fields.get('shortlink'):
|
if shortlink_url := resource.init_fields.get('shortlink'):
|
||||||
await _save_shortlink(
|
await _save_shortlink(
|
||||||
conn, site, url, resource, shortlink_url, site_path.last_visit
|
conn,
|
||||||
|
site,
|
||||||
|
url,
|
||||||
|
resource_id,
|
||||||
|
shortlink_url,
|
||||||
|
site_path.last_visit,
|
||||||
)
|
)
|
||||||
|
site_path.resource_id = resource_id
|
||||||
|
site_path.ok_count += 1
|
||||||
|
await site_path.save(conn)
|
||||||
return is_new_resource
|
return is_new_resource
|
||||||
|
|
||||||
|
|
||||||
async def _handle_text_resource(
|
async def _handle_text_resource(
|
||||||
app, conn, tf, site, site_path, resource, url
|
app, conn, tf, site, site_path, resource, url
|
||||||
) -> tuple[bool, bool]:
|
) -> tuple[Optional[int], bool]:
|
||||||
"""
|
"""
|
||||||
Ingest a text resource.
|
Ingest a text resource, returning the id of the possibly merged resource.
|
||||||
|
|
||||||
Return whether the resource is relevant and whether it is new.
|
Return the id of the merged resource (or None if the incoming resource
|
||||||
|
has text that is too short to be worth storing as a resource) and
|
||||||
|
whether the resource is new (False if the returned resource_id is None).
|
||||||
"""
|
"""
|
||||||
# save the resource's internal links
|
# save the resource's internal links
|
||||||
paths = []
|
paths = []
|
||||||
|
@ -250,22 +253,18 @@ async def _handle_text_resource(
|
||||||
app.config['elasticsearch']['index_base_name'],
|
app.config['elasticsearch']['index_base_name'],
|
||||||
)
|
)
|
||||||
await site_path.save(conn)
|
await site_path.save(conn)
|
||||||
return False, False
|
return None, False
|
||||||
|
|
||||||
simhash = simhash_from_bigint(resource.simhash)
|
simhash = simhash_from_bigint(resource.simhash)
|
||||||
index = site.simhash_index
|
index = site.simhash_index
|
||||||
similar_ids = search_simhash(index, simhash)
|
similar_ids = search_simhash(index, simhash)
|
||||||
|
print(similar_ids, site_path.resource_id)
|
||||||
|
|
||||||
# determine the destination resource and resources to be merged into it
|
# determine the destination resource and resources to be merged into it
|
||||||
old_id = site_path.resource_id
|
old_id = site_path.resource_id
|
||||||
if (
|
if old_id and old_id in similar_ids:
|
||||||
old_id
|
merge_ids = similar_ids
|
||||||
and old_id in similar_ids
|
dest_resource = await TextResource().load(conn, old_id)
|
||||||
and ( # similar to old text
|
|
||||||
dest_resource := await TextResource().load(conn, old_id)
|
|
||||||
)
|
|
||||||
):
|
|
||||||
merge_ids = list(filter(lambda elem: elem != old_id, similar_ids))
|
|
||||||
else: # no old text, or old text not similar any more
|
else: # no old text, or old text not similar any more
|
||||||
if old_id:
|
if old_id:
|
||||||
await site_path.unlink_resource(
|
await site_path.unlink_resource(
|
||||||
|
@ -302,7 +301,6 @@ async def _handle_text_resource(
|
||||||
create_simhash(index, resource.id_, simhash)
|
create_simhash(index, resource.id_, simhash)
|
||||||
|
|
||||||
# add resource to search index
|
# add resource to search index
|
||||||
if resource.content_type in ('html', 'plain'):
|
|
||||||
await index_resource(
|
await index_resource(
|
||||||
app.search_engine,
|
app.search_engine,
|
||||||
tf,
|
tf,
|
||||||
|
@ -312,26 +310,25 @@ async def _handle_text_resource(
|
||||||
url,
|
url,
|
||||||
)
|
)
|
||||||
|
|
||||||
# merge resources: merge_ids -> resource
|
# replace references to any merge resource with links to the dest resource
|
||||||
for merge_id in merge_ids:
|
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=ANY($2)"
|
||||||
# replace links to the merge resource with links to the dest resource
|
await conn.execute(sql, resource.id_, merge_ids)
|
||||||
sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2"
|
|
||||||
await conn.execute(sql, resource.id_ or None, merge_id)
|
# remove orphaned resources after merging
|
||||||
# remove orphaned merge resource
|
sql = "DELETE FROM resource WHERE id=ANY($1) RETURNING (id, lang)"
|
||||||
sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)"
|
rows = await conn.fetch(sql, set(merge_ids) - set([resource.id_]))
|
||||||
found = await conn.fetchval(sql, merge_id)
|
for row in rows:
|
||||||
if found:
|
|
||||||
await delete_resource(
|
await delete_resource(
|
||||||
app.search_engine,
|
app.search_engine,
|
||||||
found[1],
|
row['row'][1],
|
||||||
merge_id,
|
row['row'][0],
|
||||||
)
|
)
|
||||||
|
|
||||||
return True, is_new_resource
|
return resource.id_, is_new_resource
|
||||||
|
|
||||||
|
|
||||||
async def _save_shortlink(
|
async def _save_shortlink(
|
||||||
conn, site, url, resource, shortlink_url, last_visit
|
conn, site, url, resource_id, shortlink_url, last_visit
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Save a shortlink.
|
Save a shortlink.
|
||||||
|
@ -349,11 +346,11 @@ async def _save_shortlink(
|
||||||
last_visit=last_visit,
|
last_visit=last_visit,
|
||||||
ok_count=1,
|
ok_count=1,
|
||||||
canonical=False,
|
canonical=False,
|
||||||
resource_id=resource.id_,
|
resource_id=resource_id,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
shortlink.last_visit = last_visit
|
shortlink.last_visit = last_visit
|
||||||
shortlink.ok_count += 1
|
shortlink.ok_count += 1
|
||||||
shortlink.canonical = False
|
shortlink.canonical = False
|
||||||
shortlink.resource_id = resource.id_
|
shortlink.resource_id = resource_id
|
||||||
await shortlink.save(conn)
|
await shortlink.save(conn)
|
||||||
|
|
Loading…
Reference in a new issue