mirror of
https://github.com/simon987/od-database.git
synced 2025-04-19 18:36:44 +00:00
Only queue http tasks (temp)
This commit is contained in:
parent
310f343423
commit
06ae89f4d2
@@ -309,14 +309,15 @@ class Database:
|
|||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
def get_oldest_updated_websites(self, size: int):
|
def get_oldest_updated_websites(self, size: int, prefix: str):
|
||||||
|
|
||||||
with psycopg2.connect(self.db_conn_str) as conn:
|
with psycopg2.connect(self.db_conn_str) as conn:
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
cursor.execute("SELECT id, url, last_modified FROM website "
|
cursor.execute("SELECT id, url, last_modified FROM website "
|
||||||
|
"WHERE url LIKE %s "
|
||||||
"ORDER BY last_modified ASC LIMIT %s",
|
"ORDER BY last_modified ASC LIMIT %s",
|
||||||
(size,))
|
(prefix + "%", size, ))
|
||||||
return [Website(url=r[1],
|
return [Website(url=r[1],
|
||||||
website_id=r[0],
|
website_id=r[0],
|
||||||
last_modified=r[2],
|
last_modified=r[2],
|
||||||
|
4
tasks.py
4
tasks.py
@@ -133,7 +133,7 @@ class TaskManager:
|
|||||||
def _generate_crawling_tasks(self):
|
def _generate_crawling_tasks(self):
|
||||||
|
|
||||||
# TODO: Insert more in-depth re-crawl logic here
|
# TODO: Insert more in-depth re-crawl logic here
|
||||||
websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE)
|
websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")
|
||||||
|
|
||||||
def recrawl(website: Website):
|
def recrawl(website: Website):
|
||||||
crawl_task = Task(website.id, website.url,
|
crawl_task = Task(website.id, website.url,
|
||||||
@@ -141,7 +141,7 @@ class TaskManager:
|
|||||||
)
|
)
|
||||||
self.queue_task(crawl_task)
|
self.queue_task(crawl_task)
|
||||||
|
|
||||||
pool = ThreadPool(processes=10)
|
pool = ThreadPool(processes=3)
|
||||||
pool.map(func=recrawl, iterable=websites_to_crawl)
|
pool.map(func=recrawl, iterable=websites_to_crawl)
|
||||||
pool.close()
|
pool.close()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user