mirror of
https://github.com/simon987/Simple-Incremental-Search-Tool.git
synced 2025-04-19 18:16:45 +00:00
Added docx parsing
This commit is contained in:
parent
17c682a5ef
commit
7e5d6fe1ac
@ -5,7 +5,7 @@ from multiprocessing import Process, Value
|
|||||||
from apscheduler.schedulers.background import BackgroundScheduler
|
from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
|
||||||
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
|
||||||
PdfFileParser
|
PdfFileParser, DocxParser
|
||||||
from indexer import Indexer
|
from indexer import Indexer
|
||||||
from search import Search
|
from search import Search
|
||||||
from thumbnail import ThumbnailGenerator
|
from thumbnail import ThumbnailGenerator
|
||||||
@ -140,7 +140,8 @@ class TaskManager:
|
|||||||
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
|
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
|
||||||
PictureFileParser(chksum_calcs),
|
PictureFileParser(chksum_calcs),
|
||||||
FontParser(chksum_calcs),
|
FontParser(chksum_calcs),
|
||||||
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
|
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))), # todo get content len from other opt
|
||||||
|
DocxParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
|
||||||
mime_guesser, self.indexer, directory.id)
|
mime_guesser, self.indexer, directory.id)
|
||||||
c.crawl(directory.path, counter)
|
c.crawl(directory.path, counter)
|
||||||
|
|
||||||
|
27
parsing.py
27
parsing.py
@ -7,6 +7,7 @@ import json
|
|||||||
import chardet
|
import chardet
|
||||||
import html
|
import html
|
||||||
import warnings
|
import warnings
|
||||||
|
import docx2txt
|
||||||
from pdfminer.pdfparser import PDFParser
|
from pdfminer.pdfparser import PDFParser
|
||||||
from pdfminer.pdfdocument import PDFDocument
|
from pdfminer.pdfdocument import PDFDocument
|
||||||
from pdfminer.pdfpage import PDFPage
|
from pdfminer.pdfpage import PDFPage
|
||||||
@ -377,7 +378,10 @@ class PdfFileParser(GenericFileParser):
|
|||||||
document = PDFDocument(parser)
|
document = PDFDocument(parser)
|
||||||
|
|
||||||
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
|
if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
|
||||||
|
if isinstance(document.info[0]["Title"], bytes):
|
||||||
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
|
||||||
|
else:
|
||||||
|
info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if document.is_extractable:
|
if document.is_extractable:
|
||||||
@ -449,3 +453,26 @@ class EbookParser(GenericFileParser):
|
|||||||
return info
|
return info
|
||||||
|
|
||||||
|
|
||||||
|
class DocxParser(GenericFileParser):
    """Parser that stores the plain text of a .docx document in ``info["content"]``.

    The text is extracted with ``docx2txt.process`` and truncated to at most
    ``content_length`` characters.
    """

    # NOTE(review): presumably marks this parser as not being the fallback
    # parser -- confirm against how GenericFileParser.is_default is consumed.
    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """Create the parser.

        :param checksum_calculators: passed unchanged to the parent constructor
        :param content_length: maximum number of characters kept as content
        """
        super().__init__(checksum_calculators)

        self.content_length = content_length

        self.mime_types = [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ]

    def parse(self, full_path: str):
        """Parse the document at ``full_path``.

        :param full_path: path of the .docx file to read
        :return: the info mapping built by the parent ``parse`` with an added
            ``"content"`` entry holding the (possibly truncated) document text
        """
        info = super().parse(full_path)

        text = docx2txt.process(full_path)

        # A slice never reads past the end of the string, so it covers both
        # the "shorter than the limit" and the "needs truncation" cases.
        info["content"] = text[0:self.content_length]

        return info
|
||||||
|
@ -13,3 +13,4 @@ unicodedata2
|
|||||||
pdfminer.six
|
pdfminer.six
|
||||||
ebooklib
|
ebooklib
|
||||||
html2text
|
html2text
|
||||||
|
docx2txt
|
13
spec/DocxParser_spec.py
Normal file
13
spec/DocxParser_spec.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
from unittest import TestCase
|
||||||
|
from parsing import DocxParser
|
||||||
|
|
||||||
|
|
||||||
|
class DocxParserTest(TestCase):
    """Exercises DocxParser against the sample document in test_files."""

    def test_parse_content(self):
        # The parser is asked for at most 1000 characters of content.
        docx_parser = DocxParser([], 1000)
        parsed = docx_parser.parse("test_files/docx1.docx")

        content_size = len(parsed["content"])
        self.assertEqual(content_size, 1000)
|
BIN
spec/test_files/docx1.docx
Normal file
BIN
spec/test_files/docx1.docx
Normal file
Binary file not shown.
127
tmp.py
127
tmp.py
@ -1,127 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
|
|
||||||
"""
|
|
||||||
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
|
|
||||||
"""
|
|
||||||
import sys
|
|
||||||
import logging
|
|
||||||
import six
|
|
||||||
import pdfminer.settings
|
|
||||||
pdfminer.settings.STRICT = False
|
|
||||||
import pdfminer.high_level
|
|
||||||
import pdfminer.layout
|
|
||||||
from pdfminer.image import ImageWriter
|
|
||||||
|
|
||||||
|
|
||||||
def extract_text(files=None, outfile='-',
                 _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
                 output_type='text', codec='utf-8', strip_control=False,
                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
                 layoutmode='normal', output_dir=None, debug=False,
                 disable_caching=False, **other):
    """Extract the text of every file in ``files`` into a single output stream.

    :param files: paths of the input files; must be non-empty
    :param outfile: output path, or ``'-'`` to write to ``sys.stdout``
    :param _py2_no_more_posargs: shim that must stay ``None``; it catches
        surplus positional arguments
    :param no_laparams: when true, no ``LAParams`` object is created and
        ``laparams`` is ``None``
    :param output_type: when left at ``"text"`` and ``outfile`` is a path, it is
        replaced by the type implied by a ``.htm``/``.html``/``.xml``/``.tag``
        suffix of ``outfile``
    :param codec: forced to ``'utf-8'`` when writing to a ``sys.stdout`` whose
        ``encoding`` is not ``None``
    :param output_dir: when truthy, an ``ImageWriter`` is created for it
    :return: the output stream -- ``sys.stdout`` or the file opened in binary
        write mode; the caller is responsible for closing it
    :raises ValueError: if surplus positional arguments were given or ``files``
        is empty

    The remaining parameters are not used here directly; like every other local
    name they reach ``pdfminer.high_level.extract_text_to_fp`` via ``**locals()``.
    """
    if _py2_no_more_posargs is not None:
        raise ValueError("Too many positional arguments passed.")
    if not files:
        raise ValueError("Must provide files to work upon!")

    # NOTE: local names in this function are part of its behaviour -- they are
    # all forwarded through **locals() below, so do not rename or add locals.

    # If any LAParams group arguments were passed, create an LAParams object and
    # populate with given args. Otherwise, set it to None.
    if not no_laparams:
        laparams = pdfminer.layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
            paramv = locals().get(param, None)
            if paramv is not None:
                setattr(laparams, param, paramv)
    else:
        laparams = None

    imagewriter = None
    if output_dir:
        imagewriter = ImageWriter(output_dir)

    if output_type == "text" and outfile != "-":
        for override, alttype in ((".htm", "html"),
                                  (".html", "html"),
                                  (".xml", "xml"),
                                  (".tag", "tag")):
            if outfile.endswith(override):
                output_type = alttype

    if outfile == "-":
        outfp = sys.stdout
        if outfp.encoding is not None:
            codec = 'utf-8'
    else:
        outfp = open(outfile, "wb")

    try:
        for fname in files:
            with open(fname, "rb") as fp:
                pdfminer.high_level.extract_text_to_fp(fp, **locals())
    except BaseException:
        # The caller never receives the handle on failure, so release the file
        # opened above here (stdout is left alone) before propagating the error.
        if outfp is not sys.stdout:
            outfp.close()
        raise
    return outfp
|
|
||||||
|
|
||||||
# main
|
|
||||||
def main(args=None):
    """Command-line entry point: parse the arguments and run ``extract_text``.

    :param args: argument list handed to ``argparse``; ``None`` lets argparse
        fall back to ``sys.argv``
    :return: ``0`` once extraction has finished
    """
    import argparse
    P = argparse.ArgumentParser(description=__doc__)
    P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
    P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
    P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
    P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
    P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
    P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
    P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
    P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
    P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
    P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
    P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
    A = P.parse_args(args=args)

    # Page numbers are 1-based on the command line and 0-based internally.
    # --page-numbers takes precedence over the legacy --pagenos, as its help
    # text promises (previously --pagenos silently won when both were given).
    if A.page_numbers:
        A.page_numbers = set([x-1 for x in A.page_numbers])
    elif A.pagenos:
        A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])

    if six.PY2 and sys.stdin.encoding:
        A.password = A.password.decode(sys.stdin.encoding)

    if A.output_type == "text" and A.outfile != "-":
        for override, alttype in ((".htm", "html"),
                                  (".html", "html"),
                                  (".xml", "xml"),
                                  (".tag", "tag")):
            if A.outfile.endswith(override):
                A.output_type = alttype

    if A.outfile == "-" and sys.stdout.encoding is not None:
        # Why ignore outfp.encoding? :-/ stupid cathal?
        A.codec = 'utf-8'

    # extract_text creates the ImageWriter and opens the output stream itself.
    # Opening the file here as well only leaked a second descriptor, because
    # the handle was immediately overwritten by the one returned below.
    outfp = extract_text(**vars(A))
    if outfp is sys.stdout:
        # Do not close the interpreter's stdout on behalf of the caller.
        outfp.flush()
    else:
        outfp.close()
    return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__': sys.exit(main())
|
|
Loading…
x
Reference in New Issue
Block a user