From 8246ce62516696023a5372dba6094a51246a469b Mon Sep 17 00:00:00 2001 From: ibu Date: Wed, 8 Dec 2021 12:41:34 +0000 Subject: [PATCH] Only return a SitePath if the site has crawl_enabled=true --- src/atextcrawler/resource/operations.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/atextcrawler/resource/operations.py b/src/atextcrawler/resource/operations.py index dffe2bc..078a668 100644 --- a/src/atextcrawler/resource/operations.py +++ b/src/atextcrawler/resource/operations.py @@ -99,14 +99,19 @@ async def get_site_path( Return the next path of a given site that needs to be processed. If none needs to be processed, return None. + In particular, for sites having crawl_enabled=false return None. Only return paths that have last been visited before *before* - or not been processed at all. Paths with a ok_count of -3 or lower + or not been processed at all. Paths with an ok_count of -3 or lower are dropped. If *only_new*, limit to paths that have not been processed at all, irrespective of the value of *before*. """ + sql = "SELECT crawl_enabled FROM site WHERE id=$1" + crawl_enabled = await conn.fetchval(sql, site.id_) + if not crawl_enabled: + return None if only_new: sql = ( "SELECT * FROM site_path"