mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-12-15 16:19:05 +00:00
Added web interface, crawler and more work on local storage
This commit is contained in:
173
crawler.py
173
crawler.py
@@ -1,152 +1,33 @@
|
||||
import os
|
||||
import hashlib
|
||||
|
||||
|
||||
class Crawler:
    """Walks a directory tree and parses every file with the matching parser."""

    def __init__(self, enabled_parsers: list):
        # Documents produced by crawl(); one dict per successfully parsed file.
        self.documents = []
        self.enabled_parsers = enabled_parsers

    def crawl(self, root_dir: str):
        """Recursively walk root_dir, parse each file and collect the results.

        :param root_dir: root directory of the crawl
        """
        for root, _dirs, files in os.walk(root_dir):
            for filename in files:
                full_path = os.path.join(root, filename)

                parser = self.get_parser_by_ext(os.path.splitext(filename)[1])

                # Skip files no parser can handle instead of crashing on None
                # (previously this raised AttributeError when no parser matched
                # and no default parser was configured).
                if parser is None:
                    continue

                self.documents.append(parser.parse(full_path))

    def get_parser_by_ext(self, ext: str):
        """Return the parser registered for the given extension.

        Falls back to the first parser flagged as default when no extension
        match is found.

        :param ext: file extension, including the leading dot
        :return: matching parser, or None when nothing matches and no parser
                 is marked as default
        """
        for parser in self.enabled_parsers:
            if ext in parser.extensions:
                return parser

        for parser in self.enabled_parsers:
            if parser.is_default:
                return parser

        return None
||||
class FileParser:
    """Base type for file parsers; concrete subclasses provide parse()."""
||||
class FileCheckSumCalculator:
    """Abstract base class for checksum calculators over file contents."""

    def checksum(self, path: str) -> str:
        """
        Calculate the checksum of a file

        :param path: path of the file
        :return: checksum
        """
        # Subclasses must override with a concrete hash implementation.
        raise NotImplementedError()
||||
class Md5CheckSumCalculator(FileCheckSumCalculator):
    """Computes MD5 checksums of files."""

    def __init__(self):
        # Key under which the checksum is stored by GenericFileParser.
        self.name = "md5"

    def checksum(self, path: str) -> str:
        """
        Calculate the md5 checksum of a file

        :param path: path of the file
        :return: md5 checksum
        """
        digest = hashlib.md5()

        # Stream the file in 64 KiB chunks to keep memory usage flat.
        with open(path, "rb") as stream:
            chunk = stream.read(65536)
            while chunk:
                digest.update(chunk)
                chunk = stream.read(65536)

        return digest.hexdigest().upper()
||||
class Sha1CheckSumCalculator(FileCheckSumCalculator):
    """Computes SHA-1 checksums of files."""

    def __init__(self):
        # Key under which the checksum is stored by GenericFileParser.
        self.name = "sha1"

    def checksum(self, path: str) -> str:
        """
        Calculate the sha1 checksum of a file

        :param path: path of the file
        :return: sha1 checksum
        """
        digest = hashlib.sha1()

        # Stream the file in 64 KiB chunks to keep memory usage flat.
        with open(path, "rb") as stream:
            chunk = stream.read(65536)
            while chunk:
                digest.update(chunk)
                chunk = stream.read(65536)

        return digest.hexdigest().upper()
||||
class Sha256CheckSumCalculator(FileCheckSumCalculator):
    """Computes SHA-256 checksums of files."""

    def __init__(self):
        # Key under which the checksum is stored by GenericFileParser.
        self.name = "sha256"

    def checksum(self, path: str) -> str:
        """
        Calculate the sha256 checksum of a file

        :param path: path of the file
        :return: sha256 checksum
        """
        digest = hashlib.sha256()

        # Stream the file in 64 KiB chunks to keep memory usage flat.
        with open(path, "rb") as stream:
            chunk = stream.read(65536)
            while chunk:
                digest.update(chunk)
                chunk = stream.read(65536)

        return digest.hexdigest().upper()
||||
class GenericFileParser(FileParser):
    """Fallback parser that records basic metadata and checksums for any file."""

    def __init__(self, checksum_calculators: list):
        # Calculators applied to every parsed file; each contributes one
        # info[calculator.name] entry.
        self.checksum_calculators = checksum_calculators

    def parse(self, path: str) -> dict:
        """
        Parse a generic file

        :param path: path of the file to parse
        :return: dict information about the file
        """
        info = dict()

        info["size"] = os.path.getsize(path)
        # Fix: take the basename first, otherwise "name" kept the full
        # directory path (minus extension) instead of the file's own name.
        info["name"] = os.path.splitext(os.path.basename(path))[0]

        for calculator in self.checksum_calculators:
            info[calculator.name] = calculator.checksum(path)

        return info
|
||||
|
||||
|
||||
|
||||
# def crawl(root_dir: str) -> None:
|
||||
# docs = []
|
||||
#
|
||||
# for root, dirs, files in os.walk(root_dir):
|
||||
#
|
||||
# print(root)
|
||||
#
|
||||
# for filename in files:
|
||||
# full_path = os.path.join(root, filename)
|
||||
#
|
||||
# doc = dict()
|
||||
#
|
||||
# doc["md5"] = md5sum(full_path)
|
||||
# doc["path"] = root
|
||||
# doc["name"] = filename
|
||||
# doc["size"] = os.path.getsize(full_path)
|
||||
# doc["mtime"] = int(os.path.getmtime(full_path))
|
||||
#
|
||||
# mime_type = mimetypes.guess_type(full_path)[0]
|
||||
#
|
||||
# if mime_type is not None:
|
||||
#
|
||||
# doc["mime"] = mime_type
|
||||
#
|
||||
# if mime_type.startswith("image"):
|
||||
# try:
|
||||
# width, height = Image.open(full_path).size
|
||||
#
|
||||
# doc["width"] = width
|
||||
# doc["height"] = height
|
||||
# except OSError:
|
||||
# doc.pop('mime', None)
|
||||
# pass
|
||||
# except ValueError:
|
||||
# doc.pop('mime', None)
|
||||
# pass
|
||||
#
|
||||
# docs.append(doc)
|
||||
#
|
||||
# file = open("crawler.json", "w")
|
||||
# file.write(simplejson.dumps(docs))
|
||||
# file.close()
|
||||
#
|
||||
#
|
||||
Reference in New Issue
Block a user