mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-10 14:06:41 +00:00
Added docx parsing
This commit is contained in:
parent
17c682a5ef
commit
7e5d6fe1ac
@ -5,7 +5,7 @@ from multiprocessing import Process, Value
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
||||
PdfFileParser
|
||||
PdfFileParser, DocxParser
|
||||
from indexer import Indexer
|
||||
from search import Search
|
||||
from thumbnail import ThumbnailGenerator
|
||||
@ -140,7 +140,8 @@ class TaskManager:
|
||||
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
|
||||
PictureFileParser(chksum_calcs),
|
||||
FontParser(chksum_calcs),
|
||||
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
|
||||
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
|
||||
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
|
||||
mime_guesser, self.indexer, directory.id)
|
||||
c.crawl(directory.path, counter)
|
||||
|
||||
|
29
parsing.py
29
parsing.py
@ -7,6 +7,7 @@ import json
|
||||
import chardet
|
||||
import html
|
||||
import warnings
|
||||
import docx2txt
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
@ -377,7 +378,10 @@ class PdfFileParser(GenericFileParser):
|
||||
document = PDFDocument(parser)
|
||||
|
||||
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
|
||||
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
||||
if isinstance(document.info[0]["Title"], bytes):
|
||||
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
||||
else:
|
||||
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
|
||||
|
||||
try:
|
||||
if document.is_extractable:
|
||||
@ -449,3 +453,26 @@ class EbookParser(GenericFileParser):
|
||||
return info
|
||||
|
||||
|
||||
class DocxParser(GenericFileParser):
    """Extracts the plain-text content of .docx documents using docx2txt."""
    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        super().__init__(checksum_calculators)

        # Maximum number of characters of extracted text to keep
        self.content_length = content_length

        self.mime_types = [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ]

    def parse(self, full_path: str):
        """Parse the document at *full_path* and return the metadata dict
        with the (possibly truncated) document text stored under "content".
        """
        info = super().parse(full_path)

        text = docx2txt.process(full_path)

        # Keep at most content_length characters of the extracted text
        info["content"] = text if len(text) < self.content_length else text[0:self.content_length]

        return info
|
||||
|
@ -12,4 +12,5 @@ brotli
|
||||
unicodedata2
|
||||
pdfminer.six
|
||||
ebooklib
|
||||
html2text
|
||||
html2text
|
||||
docx2txt
|
13
spec/DocxParser_spec.py
Normal file
13
spec/DocxParser_spec.py
Normal file
@ -0,0 +1,13 @@
|
||||
from unittest import TestCase
|
||||
from parsing import DocxParser
|
||||
|
||||
|
||||
class DocxParserTest(TestCase):
    """Tests for parsing.DocxParser."""

    def test_parse_content(self):
        # A content_length of 1000 should cap the extracted text of the
        # (longer) fixture document at exactly 1000 characters
        docx_parser = DocxParser([], 1000)

        result = docx_parser.parse("test_files/docx1.docx")

        self.assertEqual(len(result["content"]), 1000)
|
BIN
spec/test_files/docx1.docx
Normal file
BIN
spec/test_files/docx1.docx
Normal file
Binary file not shown.
127
tmp.py
127
tmp.py
@ -1,127 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
"""
|
||||
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
|
||||
"""
|
||||
import sys
|
||||
import logging
|
||||
import six
|
||||
import pdfminer.settings
|
||||
pdfminer.settings.STRICT = False
|
||||
import pdfminer.high_level
|
||||
import pdfminer.layout
|
||||
from pdfminer.image import ImageWriter
|
||||
|
||||
|
||||
def extract_text(files=[], outfile='-',
                 _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
                 output_type='text', codec='utf-8', strip_control=False,
                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
                 layoutmode='normal', output_dir=None, debug=False,
                 disable_caching=False, **other):
    """Convert the PDF files in *files* to text/html/xml/tag output.

    Writes the converted output to *outfile* ('-' means stdout) and returns
    the output file object, still open — the caller is responsible for
    closing it.

    Raises ValueError when extra positional arguments are passed or when
    *files* is empty.

    NOTE(review): `files=[]` is a mutable default argument; it is never
    mutated here, so it is harmless, but `files=None` would be the safer
    idiom.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # If any LAParams group arguments were passed, create an LAParams object and
    # populate with given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    # Optionally extract embedded images into output_dir
    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    # When writing "text" to a named file, infer the real output type from
    # the file extension instead
    if output_type == "text" and outfile != "-":
        for override, alttype in ( (".htm", "html"),
                                   (".html", "html"),
                                   (".xml", "xml"),
                                   (".tag", "tag") ):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            # Force utf-8 regardless of the terminal's own encoding
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    # NOTE(review): **locals() forwards every local name (including fp,
    # fname and files themselves) to extract_text_to_fp; this presumably
    # relies on that function absorbing the extras via a **kwargs
    # parameter — verify against the pinned pdfminer version.
    for fname in files:
        with open(fname, "rb") as fp:
            pdfminer.high_level.extract_text_to_fp(fp, **locals())
    return outfp
|
||||
|
||||
# main
def main(args=None):
    """Command-line entry point: parse CLI arguments, then run
    extract_text() over the given files.

    Returns 0 on success (intended to be passed to sys.exit()).
    """
    import argparse
    P = argparse.ArgumentParser(description=__doc__)
    P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
    P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
    P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
    P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
    P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
    P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
    P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
    P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
    P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
    P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
    P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
    A = P.parse_args(args=args)

    # Convert 1-based page numbers from the CLI into the 0-based set
    # pdfminer expects.
    # NOTE(review): the code lets --pagenos (checked last) override
    # --page-numbers, which contradicts the --page-numbers help text
    # ("supercedes --pagenos") — confirm which precedence is intended.
    if A.page_numbers:
        A.page_numbers = set([x-1 for x in A.page_numbers])
    if A.pagenos:
        A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])

    # NOTE(review): this imagewriter and the outfp computed below are dead —
    # extract_text(**vars(A)) recomputes both from A's attributes and its
    # return value rebinds outfp.
    imagewriter = None
    if A.output_dir:
        imagewriter = ImageWriter(A.output_dir)

    # Python 2 only: decode the password bytes with the terminal encoding
    if six.PY2 and sys.stdin.encoding:
        A.password = A.password.decode(sys.stdin.encoding)

    # When writing "text" to a named file, infer the real output type from
    # the file extension
    if A.output_type == "text" and A.outfile != "-":
        for override, alttype in ( (".htm", "html"),
                                   (".html", "html"),
                                   (".xml", "xml" ),
                                   (".tag", "tag" ) ):
            if A.outfile.endswith(override):
                A.output_type = alttype

    if A.outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            # Why ignore outfp.encoding? :-/ stupid cathal?
            A.codec = 'utf-8'
    else:
        outfp = open(A.outfile, "wb")

    ## Test Code
    outfp = extract_text(**vars(A))
    outfp.close()
    return 0


if __name__ == '__main__': sys.exit(main())
|
Loading…
x
Reference in New Issue
Block a user