Added docx parsing

This commit is contained in:
simon987 2018-04-16 20:04:45 -04:00
parent 17c682a5ef
commit 7e5d6fe1ac
6 changed files with 46 additions and 131 deletions

View File

@ -5,7 +5,7 @@ from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \ from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \ PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser PdfFileParser, DocxParser
from indexer import Indexer from indexer import Indexer
from search import Search from search import Search
from thumbnail import ThumbnailGenerator from thumbnail import ThumbnailGenerator
@ -140,7 +140,8 @@ class TaskManager:
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
PictureFileParser(chksum_calcs), PictureFileParser(chksum_calcs),
FontParser(chksum_calcs), FontParser(chksum_calcs),
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
mime_guesser, self.indexer, directory.id) mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter) c.crawl(directory.path, counter)

View File

@ -7,6 +7,7 @@ import json
import chardet import chardet
import html import html
import warnings import warnings
import docx2txt
from pdfminer.pdfparser import PDFParser from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage from pdfminer.pdfpage import PDFPage
@ -377,7 +378,10 @@ class PdfFileParser(GenericFileParser):
document = PDFDocument(parser) document = PDFDocument(parser)
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"": if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
if isinstance(document.info[0]["Title"], bytes):
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n" info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
else:
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
try: try:
if document.is_extractable: if document.is_extractable:
@ -449,3 +453,26 @@ class EbookParser(GenericFileParser):
return info return info
class DocxParser(GenericFileParser):
    """Parser that extracts the text content of Word (.docx) documents.

    The extracted text is stored under the ``"content"`` key of the info
    mapping produced by the parent parser, capped at ``content_length``
    characters.
    """

    # NOTE(review): presumably marks this parser as not being the fallback
    # parser -- confirm against GenericFileParser and the crawler.
    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """Set up the parser.

        :param checksum_calculators: forwarded unchanged to the parent parser
        :param content_length: maximum number of characters of document text
            kept in ``info["content"]``
        """
        super().__init__(checksum_calculators)

        # The only mime type handled by this parser: Office Open XML
        # word-processing documents.
        self.mime_types = [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ]
        self.content_length = content_length

    def parse(self, full_path: str):
        """Parse the file at ``full_path`` and attach its text content.

        :param full_path: path of the .docx file to read
        :return: the info mapping from the parent ``parse`` with
            ``"content"`` set to at most ``content_length`` characters
        """
        parsed = super().parse(full_path)

        # Slicing past the end of a string simply returns the whole string,
        # so a single slice handles both short and long documents.
        parsed["content"] = docx2txt.process(full_path)[0:self.content_length]

        return parsed

View File

@ -13,3 +13,4 @@ unicodedata2
pdfminer.six pdfminer.six
ebooklib ebooklib
html2text html2text
docx2txt

13
spec/DocxParser_spec.py Normal file
View File

@ -0,0 +1,13 @@
from unittest import TestCase
from parsing import DocxParser
class DocxParserTest(TestCase):
    """Checks DocxParser against the bundled ``test_files/docx1.docx`` sample."""

    def test_parse_content(self):
        # The test expects exactly 1000 characters of content when the
        # parser is capped at 1000 characters.
        docx_parser = DocxParser([], 1000)

        parsed = docx_parser.parse("test_files/docx1.docx")
        content_size = len(parsed["content"])

        self.assertEqual(content_size, 1000)

BIN
spec/test_files/docx1.docx Normal file

Binary file not shown.

127
tmp.py
View File

@ -1,127 +0,0 @@
#!/usr/bin/env python
"""
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
"""
import sys
import logging
import six
import pdfminer.settings
pdfminer.settings.STRICT = False
import pdfminer.high_level
import pdfminer.layout
from pdfminer.image import ImageWriter
def extract_text(files=[], outfile='-',
            _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
            no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
            word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
            output_type='text', codec='utf-8', strip_control=False,
            maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
            layoutmode='normal', output_dir=None, debug=False,
            disable_caching=False, **other):
    """Feed every file in ``files`` to ``pdfminer.high_level.extract_text_to_fp``.

    Args:
        files: Paths of the input files; each is opened in binary mode.
            Must not be empty.
        outfile: ``'-'`` selects ``sys.stdout``; any other value is opened
            as a file in ``"wb"`` mode.
        _py2_no_more_posargs: Guard against extra positional arguments;
            must be left as ``None``.
        no_laparams: When true, ``laparams`` is ``None`` instead of an
            ``LAParams`` object.
        all_texts, detect_vertical, word_margin, char_margin, line_margin,
        boxes_flow: Each value that is not ``None`` is set as the attribute
            of the same name on the ``LAParams`` object.
        output_type: When it is ``"text"`` and ``outfile`` is not ``'-'``,
            it is replaced according to the ``outfile`` extension
            (``.htm``/``.html`` -> ``html``, ``.xml`` -> ``xml``,
            ``.tag`` -> ``tag``).
        codec: Overwritten with ``'utf-8'`` when writing to a stdout that
            reports an encoding.
        output_dir: When truthy, an ``ImageWriter`` is created for it.
        strip_control, maxpages, page_numbers, password, scale, rotation,
        layoutmode, debug, disable_caching, other: Not used directly here;
            they reach ``extract_text_to_fp`` through ``**locals()``.

    Returns:
        The output stream (``sys.stdout`` or the file opened for
        ``outfile``). It is not closed here.

    Raises:
        ValueError: If ``_py2_no_more_posargs`` is not ``None`` or
            ``files`` is empty.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # If any LAParams group arguments were passed, create an LAParams object and
    # populate with given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
            # Look the parameter up by name among this function's locals.
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    # Infer the output type from the output file extension when the caller
    # left the default "text" type in place.
    if output_type == "text" and outfile != "-":
        for override, alttype in (  (".htm", "html"),
                                    (".html", "html"),
                                    (".xml", "xml"),
                                    (".tag", "tag") ):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            # The codec passed by the caller is discarded in this case.
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    for fname in files:
        with open(fname, "rb") as fp:
            # Every local bound at this point (the parameters plus laparams,
            # imagewriter, outfp, fname, fp, ...) is forwarded as a keyword
            # argument, so renaming a local changes what is passed.
            pdfminer.high_level.extract_text_to_fp(fp, **locals())
    return outfp
# main
def main(args=None):
    """Parse command-line options and run ``extract_text`` with them.

    Args:
        args: Argument list handed to ``ArgumentParser.parse_args``.
            ``None`` lets argparse read the process arguments.

    Returns:
        ``0`` once extraction has finished.
    """
    import argparse

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
    parser.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
    parser.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
    parser.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
    parser.add_argument("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse")
    parser.add_argument("-P", "--password", type=str, default="", help="Decryption password for PDF")
    parser.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
    parser.add_argument("-t", "--output_type", type=str, default="text", help="Output type: text|html|xml|tag (default is text)")
    parser.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
    parser.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
    parser.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    parser.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
    parser.add_argument("-W", "--word-margin", type=float, default=None, help="LAParams word margin")
    parser.add_argument("-M", "--char-margin", type=float, default=None, help="LAParams char margin")
    parser.add_argument("-L", "--line-margin", type=float, default=None, help="LAParams line margin")
    parser.add_argument("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow")
    parser.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    parser.add_argument("-n", "--no-laparams", default=False, action="store_true", help="Pass None as LAParams")
    parser.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
    parser.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    parser.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    parser.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
    opts = parser.parse_args(args=args)

    # Page numbers given on the command line are shifted down by one before
    # being passed on. --page-numbers takes precedence over the legacy
    # --pagenos, as its help text states (previously --pagenos silently won
    # when both were given).
    if opts.page_numbers:
        opts.page_numbers = {x - 1 for x in opts.page_numbers}
    elif opts.pagenos:
        opts.page_numbers = {int(x) - 1 for x in opts.pagenos.split(",")}

    # NOTE: the writer is not used in this function; extract_text builds its
    # own from output_dir. Kept so construction happens as before.
    imagewriter = None
    if opts.output_dir:
        imagewriter = ImageWriter(opts.output_dir)

    # On Python 2 the password arrives as bytes; decode it with the
    # terminal's encoding when one is known.
    if six.PY2 and sys.stdin.encoding:
        opts.password = opts.password.decode(sys.stdin.encoding)

    # Infer the output type from the output file extension when the default
    # "text" type was left in place.
    if opts.output_type == "text" and opts.outfile != "-":
        for override, alttype in ((".htm", "html"),
                                  (".html", "html"),
                                  (".xml", "xml"),
                                  (".tag", "tag")):
            if opts.outfile.endswith(override):
                opts.output_type = alttype

    # When writing to a stdout that reports an encoding, the codec is forced
    # to utf-8. The output file itself is opened by extract_text; opening it
    # here as well only leaked a second, never-closed handle.
    if opts.outfile == "-" and sys.stdout.encoding is not None:
        opts.codec = 'utf-8'

    outfp = extract_text(**vars(opts))
    if outfp is sys.stdout:
        # Do not close the interpreter's stdout; just push the output out.
        outfp.flush()
    else:
        outfp.close()
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == '__main__': sys.exit(main())