Mirror of https://github.com/simon987/od-database.git (synced 2025-04-19 18:36:44 +00:00)
Update readme

commit 5b680be770
parent e02d08ca62
README.md (20 lines changed: +20)
````diff
@@ -36,6 +36,26 @@ RESULTS_PER_PAGE = (25, 50, 100, 250, 500, 1000)
 SUBMIT_FTP = False
 # Allow http(s) websites in /submit
 SUBMIT_HTTP = True
+
+# Number of re-crawl tasks to keep in the queue
+RECRAWL_POOL_SIZE = 10000
+# task_tracker API url
+TT_API = "http://localhost:3010"
+# task_tracker crawl project id
+TT_CRAWL_PROJECT = 3
+# task_tracker indexing project id
+TT_INDEX_PROJECT = 9
+# Number of threads to use for ES indexing
+INDEXER_THREADS = 4
+
+# ws_bucket API url
+WSB_API = "http://localhost:3020"
+# ws_bucket secret
+WSB_SECRET = "default_secret"
+# ws_bucket data directory
+WSB_PATH = "/mnt/data/github.com/simon987/ws_bucket/data"
+
+# od-database PostgreSQL connection string
+DB_CONN_STR = "dbname=od-database user=od-database password=xxx"
 ```

 ## Running the crawl server
````
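These settings are plain module-level constants read via `import config` (the tasks.py hunk below references `config.RECRAWL_POOL_SIZE` and `config.INDEXER_THREADS`). As a minimal sketch of consuming the new `DB_CONN_STR` setting — assuming psycopg2 as the PostgreSQL driver, which the repo may or may not actually use — the string can be passed through unchanged, since it is a standard libpq keyword/value connection string:

```python
# Minimal sketch, not repo code: open a connection with the new setting.
# Assumes psycopg2; DB_CONN_STR is a libpq-style "key=value" string, so
# it can be handed to connect() as-is.
import psycopg2

import config

conn = psycopg2.connect(config.DB_CONN_STR)
with conn, conn.cursor() as cur:
    cur.execute("SELECT 1")  # trivial round-trip to verify the connection
    print(cur.fetchone())
```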
tasks.py (4 lines changed: +3 −1)
```diff
@@ -77,10 +77,12 @@ class TaskManager:
         logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
         for _ in range(config.INDEXER_THREADS):
             t = Thread(target=self._do_indexing)
+            t.setDaemon(True)
             self._indexer_threads.append(t)
             t.start()

         self._recrawl_thread = Thread(target=self._do_recrawl)
+        self._recrawl_thread.setDaemon(True)
         self._recrawl_thread.start()

     def _do_indexing(self):
```
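The two `setDaemon(True)` calls are the substance of this hunk: the indexer and re-crawl workers loop forever, and without the daemon flag they would keep the Python process alive after the main thread exits. A minimal sketch of that behaviour (not repo code):

```python
# Without the daemon flag, the endless worker loop below would prevent
# the interpreter from ever exiting once the main thread returns.
import time
from threading import Thread

def worker():
    while True:  # endless loop, like _do_indexing / _do_recrawl
        time.sleep(1)

t = Thread(target=worker)
t.setDaemon(True)  # legacy spelling; `t.daemon = True` is the modern form
t.start()
# Main thread falls off the end here: with the flag set, the process
# exits cleanly; without it, it would hang on the non-daemon thread.
```

Note that `Thread.setDaemon()` is deprecated since Python 3.10 in favour of the `daemon` attribute or the `Thread(..., daemon=True)` constructor argument; it still works here, just with a deprecation warning on newer interpreters.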
```diff
@@ -131,7 +133,7 @@ class TaskManager:
     def _generate_crawling_tasks(self):

         # TODO: Insert more in-depth re-crawl logic here
-        websites_to_crawl = self.db.get_oldest_updated_websites(10000)
+        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE)

         def recrawl(website: Website):
             crawl_task = Task(website.id, website.url,
```
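The second hunk replaces the hardcoded `10000` with the new `RECRAWL_POOL_SIZE` setting, so the re-crawl queue size is tunable from one place instead of being a magic number. Presumably `get_oldest_updated_websites(n)` boils down to an ordered `LIMIT n` query; the sketch below is a guess at that shape (table and column names are hypothetical, and only the config-driven limit is the point):

```python
import config

def get_oldest_updated_websites(cursor, n: int = config.RECRAWL_POOL_SIZE):
    # Pick the n most stale websites; never-crawled rows (NULL timestamps)
    # sort first so they are re-queued before anything else.
    cursor.execute(
        "SELECT id, url FROM website "
        "ORDER BY last_modified ASC NULLS FIRST "
        "LIMIT %s",
        (n,),
    )
    return cursor.fetchall()
```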