mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-16 08:46:49 +00:00
39 lines
966 B
Python
39 lines
966 B
Python
import os
|
|
|
|
class Crawler:
    """Walk a directory tree and parse each file with a matching parser.

    Each parser object is expected to expose ``is_default``, an iterable
    ``extensions`` and a ``parse(path)`` method; those are the only members
    this class uses.
    """

    def __init__(self, enabled_parsers: list):
        """Build the extension -> parser lookup table.

        :param enabled_parsers: parser objects to dispatch files to. If
            several are flagged ``is_default`` the last one wins; if several
            claim the same extension the last one wins as well.
        """
        self.documents = []
        self.enabled_parsers = enabled_parsers

        # Always define the attribute: previously it only existed when some
        # parser was flagged as default, so crawl() raised AttributeError
        # otherwise.
        self.default_parser = None
        for parser in self.enabled_parsers:
            if parser.is_default:
                self.default_parser = parser

        self.ext_map = {}
        for parser in self.enabled_parsers:
            for ext in parser.extensions:
                self.ext_map[ext] = parser

    def crawl(self, root_dir: str):
        """Parse every file under ``root_dir`` and append to ``self.documents``.

        The parser is looked up by the file's extension as returned by
        ``os.path.splitext`` (leading dot included, case-sensitive), falling
        back to the default parser. Files with no applicable parser are
        skipped.

        :param root_dir: directory to walk recursively.
        """
        for root, _dirs, files in os.walk(root_dir):
            for filename in files:
                extension = os.path.splitext(filename)[1]
                parser = self.ext_map.get(extension, self.default_parser)

                if parser is None:
                    # Unknown extension and no default parser configured.
                    continue

                full_path = os.path.join(root, filename)
                self.documents.append(parser.parse(full_path))

    def countFiles(self, root_dir: str):
        """Return the number of files found under ``root_dir``, recursively.

        :param root_dir: directory to walk recursively.
        """
        return sum(len(files) for _root, _dirs, files in os.walk(root_dir))
|
|
|