mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-12-15 16:19:05 +00:00
Added web interface, crawler and more work on local storage
This commit is contained in:
173
crawler.py
173
crawler.py
@@ -1,152 +1,33 @@
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
|
||||
class Crawler:
    """Walks a directory tree and parses every file with the matching parser."""

    def __init__(self, enabled_parsers: list):
        # Documents produced by crawl(); one dict per successfully parsed file.
        self.documents = []
        self.enabled_parsers = enabled_parsers

    def crawl(self, root_dir: str):
        """Recursively walk root_dir, parse each file and collect the results.

        :param root_dir: root directory of the crawl
        """
        for root, _dirs, files in os.walk(root_dir):
            for filename in files:
                full_path = os.path.join(root, filename)

                parser = self.get_parser_by_ext(os.path.splitext(filename)[1])

                # Skip files no parser can handle instead of crashing on None
                # (previously this raised AttributeError when no parser matched
                # and no default parser was configured).
                if parser is None:
                    continue

                self.documents.append(parser.parse(full_path))

    def get_parser_by_ext(self, ext: str):
        """Return the parser registered for the given extension.

        Falls back to the first parser flagged as default when no extension
        match is found.

        :param ext: file extension, including the leading dot
        :return: matching parser, or None when nothing matches and no parser
                 is marked as default
        """
        for parser in self.enabled_parsers:
            if ext in parser.extensions:
                return parser

        for parser in self.enabled_parsers:
            if parser.is_default:
                return parser

        return None
||||
class FileParser:
    """Base type for file parsers; concrete subclasses provide parse()."""
||||
class FileCheckSumCalculator:
    """Abstract base class for checksum calculators over file contents."""

    def checksum(self, path: str) -> str:
        """
        Calculate the checksum of a file

        :param path: path of the file
        :return: checksum
        """
        # Subclasses must override with a concrete hash implementation.
        raise NotImplementedError()
||||
class Md5CheckSumCalculator(FileCheckSumCalculator):
    """Computes MD5 checksums of files."""

    def __init__(self):
        # Key under which the checksum is stored by GenericFileParser.
        self.name = "md5"

    def checksum(self, path: str) -> str:
        """
        Calculate the md5 checksum of a file

        :param path: path of the file
        :return: md5 checksum
        """
        digest = hashlib.md5()

        # Stream the file in 64 KiB chunks to keep memory usage flat.
        with open(path, "rb") as stream:
            chunk = stream.read(65536)
            while chunk:
                digest.update(chunk)
                chunk = stream.read(65536)

        return digest.hexdigest().upper()
||||
class Sha1CheckSumCalculator(FileCheckSumCalculator):
    """Computes SHA-1 checksums of files."""

    def __init__(self):
        # Key under which the checksum is stored by GenericFileParser.
        self.name = "sha1"

    def checksum(self, path: str) -> str:
        """
        Calculate the sha1 checksum of a file

        :param path: path of the file
        :return: sha1 checksum
        """
        digest = hashlib.sha1()

        # Stream the file in 64 KiB chunks to keep memory usage flat.
        with open(path, "rb") as stream:
            chunk = stream.read(65536)
            while chunk:
                digest.update(chunk)
                chunk = stream.read(65536)

        return digest.hexdigest().upper()
||||
class Sha256CheckSumCalculator(FileCheckSumCalculator):
    """Computes SHA-256 checksums of files."""

    def __init__(self):
        # Key under which the checksum is stored by GenericFileParser.
        self.name = "sha256"

    def checksum(self, path: str) -> str:
        """
        Calculate the sha256 checksum of a file

        :param path: path of the file
        :return: sha256 checksum
        """
        digest = hashlib.sha256()

        # Stream the file in 64 KiB chunks to keep memory usage flat.
        with open(path, "rb") as stream:
            chunk = stream.read(65536)
            while chunk:
                digest.update(chunk)
                chunk = stream.read(65536)

        return digest.hexdigest().upper()
||||
class GenericFileParser(FileParser):
    """Fallback parser that records basic metadata and checksums for any file."""

    def __init__(self, checksum_calculators: list):
        # Calculators applied to every parsed file; each contributes one
        # info[calculator.name] entry.
        self.checksum_calculators = checksum_calculators

    def parse(self, path: str) -> dict:
        """
        Parse a generic file

        :param path: path of the file to parse
        :return: dict information about the file
        """
        info = dict()

        info["size"] = os.path.getsize(path)
        # Fix: take the basename first, otherwise "name" kept the full
        # directory path (minus extension) instead of the file's own name.
        info["name"] = os.path.splitext(os.path.basename(path))[0]

        for calculator in self.checksum_calculators:
            info[calculator.name] = calculator.checksum(path)

        return info
|
||||
|
||||
|
||||
|
||||
# def crawl(root_dir: str) -> None:
|
||||
# docs = []
|
||||
#
|
||||
# for root, dirs, files in os.walk(root_dir):
|
||||
#
|
||||
# print(root)
|
||||
#
|
||||
# for filename in files:
|
||||
# full_path = os.path.join(root, filename)
|
||||
#
|
||||
# doc = dict()
|
||||
#
|
||||
# doc["md5"] = md5sum(full_path)
|
||||
# doc["path"] = root
|
||||
# doc["name"] = filename
|
||||
# doc["size"] = os.path.getsize(full_path)
|
||||
# doc["mtime"] = int(os.path.getmtime(full_path))
|
||||
#
|
||||
# mime_type = mimetypes.guess_type(full_path)[0]
|
||||
#
|
||||
# if mime_type is not None:
|
||||
#
|
||||
# doc["mime"] = mime_type
|
||||
#
|
||||
# if mime_type.startswith("image"):
|
||||
# try:
|
||||
# width, height = Image.open(full_path).size
|
||||
#
|
||||
# doc["width"] = width
|
||||
# doc["height"] = height
|
||||
# except OSError:
|
||||
# doc.pop('mime', None)
|
||||
# pass
|
||||
# except ValueError:
|
||||
# doc.pop('mime', None)
|
||||
# pass
|
||||
#
|
||||
# docs.append(doc)
|
||||
#
|
||||
# file = open("crawler.json", "w")
|
||||
# file.write(simplejson.dumps(docs))
|
||||
# file.close()
|
||||
#
|
||||
#
|
||||
Reference in New Issue
Block a user