Added docx parsing

This commit is contained in:
simon987
2018-04-16 20:04:45 -04:00
parent 17c682a5ef
commit 7e5d6fe1ac
6 changed files with 46 additions and 131 deletions

View File

@@ -5,7 +5,7 @@ from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser
PdfFileParser, DocxParser
from indexer import Indexer
from search import Search
from thumbnail import ThumbnailGenerator
@@ -140,7 +140,8 @@ class TaskManager:
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
PictureFileParser(chksum_calcs),
FontParser(chksum_calcs),
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter)