Mirror of https://github.com/simon987/Simple-Incremental-Search-Tool.git (synced 2025-04-10 14:06:41 +00:00)
Added pdf & epub parsing

parent 6d3cceb1b1
commit 17c682a5ef
@@ -3,6 +3,7 @@ default_options = {
     "ThumbnailSize": "275",
     "ThumbnailColor": "FF00FF",
     "TextFileContentLength": "8192",
+    "PdfFileContentLength": "8192",
     "MimeGuesser": "extension",  # extension, content
     "CheckSumCalculators": "",  # md5, sha1, sha256
     "FileParsers": "media, text, picture, font"  # media, text, picture
@@ -4,7 +4,8 @@ import json
 from multiprocessing import Process, Value
 from apscheduler.schedulers.background import BackgroundScheduler
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
-    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
+    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
+    PdfFileParser
 from indexer import Indexer
 from search import Search
 from thumbnail import ThumbnailGenerator
@@ -138,7 +139,8 @@ class TaskManager:
                MediaFileParser(chksum_calcs),
                TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
                PictureFileParser(chksum_calcs),
-               FontParser(chksum_calcs)],
+               FontParser(chksum_calcs),
+               PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
               mime_guesser, self.indexer, directory.id)
        c.crawl(directory.path, counter)
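
Note: the commit adds a PdfFileContentLength option to default_options above, but the inline todo shows it is not consumed yet — PdfFileParser is still handed TextFileContentLength. A minimal sketch of what the wired-up call would presumably look like, assuming get_option resolves the new key the same way as the existing ones:

    PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")))],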
@@ -87,7 +87,6 @@ class Indexer:
             "title": {"analyzer": "my_nGram", "type": "text"},
             "genre": {"analyzer": "my_nGram", "type": "text"},
             "album_artist": {"analyzer": "my_nGram", "type": "text"},
-            "font_name": {"analyzer": "my_nGram", "type": "text"},
         }}, doc_type="file", index=self.index_name)

         self.es.indices.open(index=self.index_name)
parsing.py (115 changed lines)
@@ -7,6 +7,15 @@ import json
 import chardet
 import html
 import warnings
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.layout import LAParams, LTTextBox, LTTextLine
+from pdfminer.converter import PDFPageAggregator
+import html2text
+from ebooklib import epub
+import ebooklib
 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
@@ -242,9 +251,9 @@ class PictureFileParser(GenericFileParser):
 class TextFileParser(GenericFileParser):
     is_default = False

-    def __init__(self, checksum_calculators: list, content_lenght: int):
+    def __init__(self, checksum_calculators: list, content_length: int):
         super().__init__(checksum_calculators)
-        self.content_lenght = content_lenght
+        self.content_length = content_length

         self.mime_types = [
             "text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
@@ -293,7 +302,7 @@ class TextFileParser(GenericFileParser):
         info = super().parse(full_path)

         with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_lenght)
+            raw_content = text_file.read(self.content_length)

             chardet.detect(raw_content)
             encoding = chardet.detect(raw_content)["encoding"]
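
For context, TextFileParser.parse reads at most content_length raw bytes, asks chardet for a best-guess encoding, then decodes. A self-contained sketch of that detect-then-decode pattern ("example.txt" and the 8192 cap are placeholders standing in for TextFileContentLength):

    import chardet

    # Read a bounded prefix of the file and guess its encoding from the raw bytes.
    with open("example.txt", "rb") as f:
        raw_content = f.read(8192)

    encoding = chardet.detect(raw_content)["encoding"]  # may be None if detection fails
    content = raw_content.decode(encoding or "utf-8", "replace")
    print(content[:80])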
@@ -321,7 +330,6 @@ class FontParser(GenericFileParser):
     def parse(self, full_path: str):

         info = super().parse(full_path)
-        print(info)

         with open(full_path, "rb") as f:

@@ -336,7 +344,7 @@ class FontParser(GenericFileParser):
         try:
             for name in font["name"].names:
                 if name.nameID == 4:
-                    info["font_name"] = name.toUnicode("replace")
+                    info["content"] = name.toUnicode("replace")
                     break
         except AssertionError:
             print("Could not read font name for " + full_path)
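
With this change the font's full name (the name-table record with nameID 4) lands in the generic "content" field instead of a dedicated "font_name" field, which is why the font_name mapping is dropped from the indexer above. A standalone sketch of the same lookup with fontTools ("truetype1.ttf" is a placeholder path):

    from fontTools.ttLib import TTFont

    font = TTFont("truetype1.ttf")

    # nameID 4 is the "full font name" record of the OpenType name table.
    for name in font["name"].names:
        if name.nameID == 4:
            print(name.toUnicode("replace"))  # decode, replacing undecodable bytes
            break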
@@ -344,3 +352,100 @@ class FontParser(GenericFileParser):
             print("Could not read font for " + full_path)

         return info
+
+
+class PdfFileParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/pdf", "application/x-pdf"
+        ]
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        with open(full_path, "rb") as f:
+
+            info["content"] = ""
+
+            parser = PDFParser(f)
+            document = PDFDocument(parser)
+
+            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
+                info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+
+            try:
+                if document.is_extractable:
+                    resource_manager = PDFResourceManager()
+                    la_params = LAParams()
+
+                    device = PDFPageAggregator(resource_manager, laparams=la_params)
+                    interpreter = PDFPageInterpreter(resource_manager, device)
+
+                    for page in PDFPage.create_pages(document):
+
+                        interpreter.process_page(page)
+                        layout = device.get_result()
+
+                        for lt_obj in layout:
+                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+
+                                text = lt_obj.get_text()
+
+                                if len(info["content"]) + len(text) <= self.content_length:
+                                    info["content"] += text
+                                else:
+                                    info["content"] += text[0:self.content_length - len(info["content"])]
+                                    break
+                        else:
+                            continue
+                        break
+                else:
+                    print("PDF is not extractable: " + full_path)
+            except ValueError:
+                print("Couldn't parse page for " + full_path)
+
+        return info
+
+
+class EbookParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/epub+zip"
+        ]
+
+        self.html2text = html2text.HTML2Text()
+        self.html2text.ignore_images = True
+        self.html2text.ignore_emphasis = True
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        book = epub.read_epub(full_path)
+
+        info["content"] = ""
+
+        for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+
+            text = self.html2text.handle(text.content.decode("utf-8"))
+
+            if len(info["content"]) + len(text) <= self.content_length:
+                info["content"] += text
+            else:
+                info["content"] += text[0:self.content_length - len(info["content"])]
+                break
+
+        return info
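
Both new parsers follow the same contract as the existing ones: parse(full_path) returns the info dict from GenericFileParser with the extracted (and truncated) text added under "content". A minimal usage sketch, assuming the test files shipped with this commit and an empty checksum-calculator list as in the tests:

    from parsing import PdfFileParser, EbookParser

    # No checksum calculators; cap extracted text at 8192 characters.
    pdf_parser = PdfFileParser([], 8192)
    epub_parser = EbookParser([], 8192)

    pdf_info = pdf_parser.parse("spec/test_files/pdf1.pdf")
    epub_info = epub_parser.parse("spec/test_files/epub1.epub")

    print(len(pdf_info["content"]), pdf_info["content"][:100])
    print(len(epub_info["content"]), epub_info["content"][:100])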
@@ -10,4 +10,6 @@ chardet
 fonttools
 brotli
 unicodedata2
-slate
+pdfminer.six
+ebooklib
+html2text
spec/EbookParserTest.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+from unittest import TestCase
+from parsing import EbookParser
+
+
+class EbookParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = EbookParser([], 1000)
+
+        info = parser.parse("test_files/epub1.epub")
+
+        self.assertEqual(len(info["content"]), 1000)
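
The exact-length assertion works because of the truncation branch in EbookParser.parse: once the accumulated text would exceed content_length, only the remaining budget is appended and the loop breaks, so any book longer than the cap yields exactly content_length characters. A tiny sketch of that invariant (the placeholder chunks stand in for decoded chapters):

    content_length = 1000
    content = ""
    for chunk in ["a" * 600, "b" * 600]:
        if len(content) + len(chunk) <= content_length:
            content += chunk
        else:
            # Append only the remaining budget, then stop.
            content += chunk[0:content_length - len(content)]
            break

    assert len(content) == content_length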
@@ -10,7 +10,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/truetype1.ttf")

-        self.assertEqual(info["font_name"], "Liberation Mono Bold")
+        self.assertEqual(info["content"], "Liberation Mono Bold")

     def test_parse_name_openType(self):

@@ -18,7 +18,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/opentype1.otf")

-        self.assertEqual(info["font_name"], "Linux Biolinum Keyboard O")
+        self.assertEqual(info["content"], "Linux Biolinum Keyboard O")

     def test_parse_name_woff(self):

@@ -26,7 +26,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff.woff")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")

     def test_parse_name_woff2(self):

@@ -34,4 +34,4 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff2.woff2")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")
spec/PdfFileParser.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+from unittest import TestCase
+from parsing import PdfFileParser
+
+
+class PdfParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = PdfFileParser([], 12488)
+
+        info = parser.parse("test_files/pdf1.pdf")
+
+        self.assertEqual(len(info["content"]), 12488)
+        self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
spec/test_files/epub1.epub (new binary file, not shown)
spec/test_files/pdf1.pdf (new binary file, not shown)
tmp.py (159 changed lines)
@@ -1,56 +1,127 @@
-from elasticsearch import Elasticsearch
-from indexer import Indexer
-import json
-from crawler import Crawler
-from indexer import Indexer
-from parsing import GenericFileParser, Sha256CheckSumCalculator, ExtensionMimeGuesser
-
-es = Elasticsearch()
-
-# reset
-es.indices.delete(index="test")
-es.indices.create(index="test")
-es.indices.close(index="test")
+#!/usr/bin/env python
+"""
+Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
+"""
+import sys
+import logging
+import six
+import pdfminer.settings
+pdfminer.settings.STRICT = False
+import pdfminer.high_level
+import pdfminer.layout
+from pdfminer.image import ImageWriter
+
+
-# # config
-es.indices.put_settings(body='{"analysis": {"analyzer": {"path_analyser": {'
-                             '"tokenizer": "path_tokenizer"}}, "tokenizer": {"path_tokenizer": {'
-                             '"type": "path_hierarchy"}}}}', index="test")
+def extract_text(files=[], outfile='-',
+                 _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
+                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
+                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
+                 output_type='text', codec='utf-8', strip_control=False,
+                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+                 layoutmode='normal', output_dir=None, debug=False,
+                 disable_caching=False, **other):
+    if _py2_no_more_posargs is not None:
+        raise ValueError("Too many positional arguments passed.")
+    if not files:
+        raise ValueError("Must provide files to work upon!")
+
-es.indices.put_mapping(body='{"properties": {'
-                            '"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},'
-                            '"suggest-path": {"type": "completion", "analyzer": "keyword"},'
-                            '"mime": {"type": "keyword"}'
-                            '}}', index="test", doc_type="file")
+    # If any LAParams group arguments were passed, create an LAParams object and
+    # populate with given args. Otherwise, set it to None.
+    if not no_laparams:
+        laparams = pdfminer.layout.LAParams()
+        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+            paramv = locals().get(param, None)
+            if paramv is not None:
+                setattr(laparams, param, paramv)
+    else:
+        laparams = None
+
-es.indices.open(index="test")
+    imagewriter = None
+    if output_dir:
+        imagewriter = ImageWriter(output_dir)
+
+    if output_type == "text" and outfile != "-":
+        for override, alttype in ((".htm", "html"),
+                                  (".html", "html"),
+                                  (".xml", "xml"),
+                                  (".tag", "tag")):
+            if outfile.endswith(override):
+                output_type = alttype
+
+    if outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            codec = 'utf-8'
+    else:
+        outfp = open(outfile, "wb")
+
+
-# add docs
+    for fname in files:
+        with open(fname, "rb") as fp:
+            pdfminer.high_level.extract_text_to_fp(fp, **locals())
+    return outfp
+
-# crawler = Crawler([GenericFileParser([Sha256CheckSumCalculator()], ExtensionMimeGuesser())])
-# crawler.crawl("spec/test_folder")
-#
-# indexer = Indexer("test")
-#
-# indexer.index(crawler.documents)
+# main
+def main(args=None):
+    import argparse
+    P = argparse.ArgumentParser(description=__doc__)
+    P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
+    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
+    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
+    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
+    P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
+    P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
+    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
+    P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
+    P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
+    P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
+    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
+    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
+    P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
+    P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
+    P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
+    P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
+    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
+    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
+    P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
+    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
+    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
+    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
+    A = P.parse_args(args=args)
+
-# search
-# print(es.search("test", "file", '{"query": {"term": {"name": "spec/test_folder/sub2/"}}}'))
-# print(es.search("test", "file", '{"query": {"match_all": {}}, "aggs": {"test": {"terms": {"field": "mime"}}}}'))
-# suggest = es.search("test", "file", '{"suggest": {"path-suggest": {"prefix": "spec/test_folder/sub", "completion": {"field": "suggest-path"}}}}')
-#
-# print(suggest["suggest"]["path-suggest"])
-#
-# for hit in suggest["suggest"]["path-suggest"][0]["options"]:
-#     print(hit["text"])
+    if A.page_numbers:
+        A.page_numbers = set([x-1 for x in A.page_numbers])
+    if A.pagenos:
+        A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
+
-# indexer = Indexer("test")
+    imagewriter = None
+    if A.output_dir:
+        imagewriter = ImageWriter(A.output_dir)
+
-# import time
-# time.sleep(10)
+    if six.PY2 and sys.stdin.encoding:
+        A.password = A.password.decode(sys.stdin.encoding)
+
-c = Crawler([])
-c.countFiles("/")
+    if A.output_type == "text" and A.outfile != "-":
+        for override, alttype in ((".htm", "html"),
+                                  (".html", "html"),
+                                  (".xml", "xml"),
+                                  (".tag", "tag")):
+            if A.outfile.endswith(override):
+                A.output_type = alttype
+
+    if A.outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            # Why ignore outfp.encoding? :-/ stupid cathal?
+            A.codec = 'utf-8'
+    else:
+        outfp = open(A.outfile, "wb")
+
+    ## Test Code
+    outfp = extract_text(**vars(A))
+    outfp.close()
+    return 0
+
+
+if __name__ == '__main__': sys.exit(main())
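
tmp.py is scratch space: the old Elasticsearch mapping experiment is replaced wholesale with a copy of pdfminer.six's pdf2txt.py example script, whose core call is pdfminer.high_level.extract_text_to_fp. A minimal sketch of that call on its own, assuming a recent pdfminer.six and using the test PDF from this commit as a placeholder input:

    import io

    import pdfminer.high_level

    # Extract one PDF's text into an in-memory buffer; the default codec
    # ("utf-8") writes encoded bytes, hence BytesIO.
    output = io.BytesIO()
    with open("spec/test_files/pdf1.pdf", "rb") as fp:
        pdfminer.high_level.extract_text_to_fp(fp, output)

    print(output.getvalue().decode("utf-8")[:200])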