2018-01-22 20:26:19 -05:00

64 lines
1.6 KiB
Python

import os
import hashlib
import mimetypes
from PIL import Image
import simplejson
# Root directory handed to crawl() when this script runs.
rootDir = "/home/simon/Documents"
# https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
def md5sum(filename, block_size=65536):
hash = hashlib.md5()
with open(filename, "rb") as f:
for block in iter(lambda: f.read(block_size), b""):
hash.update(block)
return hash.hexdigest()
def crawl(root_dir):
    """Index every file under *root_dir* and write the result to ``crawler.json``.

    Walks the tree with ``os.walk``, printing each directory as it is visited.
    For every file a dict is recorded with the keys ``md5``, ``path`` (the
    containing directory), ``name``, ``size`` and ``mtime`` (truncated to an
    int). ``mime`` is added when ``mimetypes`` can guess a type from the file
    name; for types starting with ``image`` the ``width`` and ``height``
    reported by Pillow are added as well. If Pillow fails to open such a file
    with ``OSError`` or ``ValueError``, the ``mime`` key is removed again.

    The collected list is serialised with ``simplejson`` and written to
    ``crawler.json`` in the current working directory, replacing any
    existing file.

    Errors raised while hashing or stat-ing a file are not caught here; they
    propagate to the caller before anything is written.
    """
    docs = []
    for root, _subdirs, files in os.walk(root_dir):
        print(root)
        for filename in files:
            full_path = os.path.join(root, filename)
            doc = {
                "md5": md5sum(full_path),
                "path": root,
                "name": filename,
                "size": os.path.getsize(full_path),
                "mtime": int(os.path.getmtime(full_path)),
            }

            mime_type = mimetypes.guess_type(full_path)[0]
            if mime_type is not None:
                doc["mime"] = mime_type
                if mime_type.startswith("image"):
                    try:
                        # Context manager closes the underlying file handle;
                        # a bare Image.open() would leak one per image.
                        with Image.open(full_path) as image:
                            doc["width"], doc["height"] = image.size
                    except (OSError, ValueError):
                        # Pillow could not read it as an image, so the
                        # guessed image type is not kept.
                        doc.pop("mime", None)

            docs.append(doc)

    # "with" guarantees the output file is closed even if serialisation fails.
    with open("crawler.json", "w") as out:
        out.write(simplejson.dumps(docs))
# Executed at module level: crawls rootDir and writes crawler.json.
crawl(rootDir)