diff --git a/crawler.py b/crawler.py index d57369a..99bf7f9 100644 --- a/crawler.py +++ b/crawler.py @@ -5,7 +5,7 @@ from multiprocessing import Process, Value from apscheduler.schedulers.background import BackgroundScheduler from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ - PdfFileParser + PdfFileParser, DocxParser from indexer import Indexer from search import Search from thumbnail import ThumbnailGenerator @@ -140,7 +140,8 @@ class TaskManager: TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), PictureFileParser(chksum_calcs), FontParser(chksum_calcs), - PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt + PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt + DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt mime_guesser, self.indexer, directory.id) c.crawl(directory.path, counter) diff --git a/parsing.py b/parsing.py index 25085a3..b9b2a0d 100644 --- a/parsing.py +++ b/parsing.py @@ -7,6 +7,7 @@ import json import chardet import html import warnings +import docx2txt from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage @@ -377,7 +378,10 @@ class PdfFileParser(GenericFileParser): document = PDFDocument(parser) if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"": - info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n" + if isinstance(document.info[0]["Title"], bytes): + info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n" + else: + info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n" try: if document.is_extractable: @@ -449,3 +453,26 @@ class EbookParser(GenericFileParser): return info +class DocxParser(GenericFileParser): + is_default = False + + def __init__(self, checksum_calculators: list, content_length: int): + super().__init__(checksum_calculators) + + self.content_length = content_length + + self.mime_types = [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ] + + def parse(self, full_path: str): + info = super().parse(full_path) + + text = docx2txt.process(full_path) + + if len(text) < self.content_length: + info["content"] = text + else: + info["content"] = text[0:self.content_length] + + return info diff --git a/requirements.txt b/requirements.txt index e1c101f..8e7d3e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ brotli unicodedata2 pdfminer.six ebooklib -html2text \ No newline at end of file +html2text +docx2txt \ No newline at end of file diff --git a/spec/DocxParser_spec.py b/spec/DocxParser_spec.py new file mode 100644 index 0000000..4758605 --- /dev/null +++ b/spec/DocxParser_spec.py @@ -0,0 +1,13 @@ +from unittest import TestCase +from parsing import DocxParser + + +class DocxParserTest(TestCase): + + def test_parse_content(self): + + parser = DocxParser([], 1000) + + info = parser.parse("test_files/docx1.docx") + + self.assertEqual(len(info["content"]), 1000) diff --git a/spec/test_files/docx1.docx b/spec/test_files/docx1.docx new file mode 100644 index 0000000..b6d828f Binary files /dev/null and b/spec/test_files/docx1.docx differ diff --git a/tmp.py b/tmp.py deleted file mode 100644 index 1e8ec0b..0000000 --- a/tmp.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python - -""" -Converts PDF text content (though not images containing text) to plain text, html, xml or "tags". -""" -import sys -import logging -import six -import pdfminer.settings -pdfminer.settings.STRICT = False -import pdfminer.high_level -import pdfminer.layout -from pdfminer.image import ImageWriter - - -def extract_text(files=[], outfile='-', - _py2_no_more_posargs=None, # Bloody Python2 needs a shim - no_laparams=False, all_texts=None, detect_vertical=None, # LAParams - word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams - output_type='text', codec='utf-8', strip_control=False, - maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0, - layoutmode='normal', output_dir=None, debug=False, - disable_caching=False, **other): - if _py2_no_more_posargs is not None: - raise ValueError("Too many positional arguments passed.") - if not files: - raise ValueError("Must provide files to work upon!") - - # If any LAParams group arguments were passed, create an LAParams object and - # populate with given args. Otherwise, set it to None. - if not no_laparams: - laparams = pdfminer.layout.LAParams() - for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"): - paramv = locals().get(param, None) - if paramv is not None: - setattr(laparams, param, paramv) - else: - laparams = None - - imagewriter = None - if output_dir: - imagewriter = ImageWriter(output_dir) - - if output_type == "text" and outfile != "-": - for override, alttype in ( (".htm", "html"), - (".html", "html"), - (".xml", "xml"), - (".tag", "tag") ): - if outfile.endswith(override): - output_type = alttype - - if outfile == "-": - outfp = sys.stdout - if outfp.encoding is not None: - codec = 'utf-8' - else: - outfp = open(outfile, "wb") - - - for fname in files: - with open(fname, "rb") as fp: - pdfminer.high_level.extract_text_to_fp(fp, **locals()) - return outfp - -# main -def main(args=None): - import argparse - P = argparse.ArgumentParser(description=__doc__) - P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.") - P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.") - P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.") - P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.") - P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse") - P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF") - P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)") - P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)") - P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding") - P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale") - P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts") - P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical") - P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin") - P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin") - P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin") - P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow") - P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode") - P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams") - P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation") - P.add_argument("-O", "--output-dir", default=None, help="Output directory for images") - P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching") - P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode") - A = P.parse_args(args=args) - - if A.page_numbers: - A.page_numbers = set([x-1 for x in A.page_numbers]) - if A.pagenos: - A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")]) - - imagewriter = None - if A.output_dir: - imagewriter = ImageWriter(A.output_dir) - - if six.PY2 and sys.stdin.encoding: - A.password = A.password.decode(sys.stdin.encoding) - - if A.output_type == "text" and A.outfile != "-": - for override, alttype in ( (".htm", "html"), - (".html", "html"), - (".xml", "xml" ), - (".tag", "tag" ) ): - if A.outfile.endswith(override): - A.output_type = alttype - - if A.outfile == "-": - outfp = sys.stdout - if outfp.encoding is not None: - # Why ignore outfp.encoding? :-/ stupid cathal? - A.codec = 'utf-8' - else: - outfp = open(A.outfile, "wb") - - ## Test Code - outfp = extract_text(**vars(A)) - outfp.close() - return 0 - - -if __name__ == '__main__': sys.exit(main())