Added pdf & epub parsing

This commit is contained in:
simon987 2018-04-16 19:42:40 -04:00
parent 6d3cceb1b1
commit 17c682a5ef
11 changed files with 264 additions and 57 deletions

View File

@ -3,6 +3,7 @@ default_options = {
"ThumbnailSize": "275",
"ThumbnailColor": "FF00FF",
"TextFileContentLength": "8192",
"PdfFileContentLength": "8192",
"MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture, font" # media, text, picture

View File

@ -4,7 +4,8 @@ import json
from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser
from indexer import Indexer
from search import Search
from thumbnail import ThumbnailGenerator
@ -138,7 +139,8 @@ class TaskManager:
MediaFileParser(chksum_calcs),
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
PictureFileParser(chksum_calcs),
FontParser(chksum_calcs)],
FontParser(chksum_calcs),
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter)

View File

@ -87,7 +87,6 @@ class Indexer:
"title": {"analyzer": "my_nGram", "type": "text"},
"genre": {"analyzer": "my_nGram", "type": "text"},
"album_artist": {"analyzer": "my_nGram", "type": "text"},
"font_name": {"analyzer": "my_nGram", "type": "text"},
}}, doc_type="file", index=self.index_name)
self.es.indices.open(index=self.index_name)

View File

@ -7,6 +7,15 @@ import json
import chardet
import html
import warnings
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
import html2text
from ebooklib import epub
import ebooklib
from PIL import Image
from fontTools.ttLib import TTFont, TTLibError
@ -242,9 +251,9 @@ class PictureFileParser(GenericFileParser):
class TextFileParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_lenght: int):
def __init__(self, checksum_calculators: list, content_length: int):
super().__init__(checksum_calculators)
self.content_lenght = content_lenght
self.content_length = content_length
self.mime_types = [
"text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
@ -293,7 +302,7 @@ class TextFileParser(GenericFileParser):
info = super().parse(full_path)
with open(full_path, "rb") as text_file:
raw_content = text_file.read(self.content_lenght)
raw_content = text_file.read(self.content_length)
chardet.detect(raw_content)
encoding = chardet.detect(raw_content)["encoding"]
@ -321,7 +330,6 @@ class FontParser(GenericFileParser):
def parse(self, full_path: str):
info = super().parse(full_path)
print(info)
with open(full_path, "rb") as f:
@ -336,7 +344,7 @@ class FontParser(GenericFileParser):
try:
for name in font["name"].names:
if name.nameID == 4:
info["font_name"] = name.toUnicode("replace")
info["content"] = name.toUnicode("replace")
break
except AssertionError:
print("Could not read font name for " + full_path)
@ -344,3 +352,100 @@ class FontParser(GenericFileParser):
print("Could not read font for " + full_path)
return info
class PdfFileParser(GenericFileParser):
    """Extracts the title and text content of PDF documents using pdfminer.

    The document title (when present) and the page text are concatenated
    into info["content"], truncated to at most content_length characters.
    """

    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """
        :param checksum_calculators: checksum calculators applied by the base parser
        :param content_length: maximum number of characters stored in info["content"]
        """
        super().__init__(checksum_calculators)
        self.content_length = content_length

        self.mime_types = [
            "application/pdf", "application/x-pdf"
        ]

    def parse(self, full_path: str):
        """Parse a PDF file and return the base parser's info dict augmented
        with the (truncated) extracted text in info["content"]."""
        info = super().parse(full_path)
        info["content"] = ""

        with open(full_path, "rb") as f:
            parser = PDFParser(f)
            document = PDFDocument(parser)

            # Prepend the document title when present. The Title entry is
            # usually raw bytes, but pdfminer can also hand back other object
            # types (e.g. PSLiteral); only decode when it really is bytes so
            # we don't crash with AttributeError on exotic documents.
            if len(document.info) > 0 and "Title" in document.info[0]:
                title = document.info[0]["Title"]
                if isinstance(title, bytes) and title != b"":
                    info["content"] += title.decode("utf-8", "replace") + "\n"

            try:
                if document.is_extractable:
                    resource_manager = PDFResourceManager()
                    la_params = LAParams()
                    device = PDFPageAggregator(resource_manager, laparams=la_params)
                    interpreter = PDFPageInterpreter(resource_manager, device)

                    for page in PDFPage.create_pages(document):
                        interpreter.process_page(page)
                        layout = device.get_result()

                        for lt_obj in layout:
                            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                                text = lt_obj.get_text()

                                remaining = self.content_length - len(info["content"])
                                if len(text) <= remaining:
                                    info["content"] += text
                                else:
                                    # Length limit reached: take what fits and
                                    # stop reading further pages.
                                    info["content"] += text[:remaining]
                                    break
                        else:
                            continue  # inner loop finished normally; next page
                        break  # inner loop hit the limit; stop paging
                else:
                    print("PDF is not extractable: " + full_path)
            except ValueError:
                print("Couldn't parse page for " + full_path)

        return info
class EbookParser(GenericFileParser):
    """Extracts the text content of epub ebooks into info["content"],
    truncated to at most content_length characters."""

    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """
        :param checksum_calculators: checksum calculators applied by the base parser
        :param content_length: maximum number of characters stored in info["content"]
        """
        super().__init__(checksum_calculators)
        self.content_length = content_length

        self.mime_types = [
            "application/epub+zip"
        ]

        # Images and emphasis markup are noise for search indexing; strip
        # them during the html -> text conversion.
        self.html2text = html2text.HTML2Text()
        self.html2text.ignore_images = True
        self.html2text.ignore_emphasis = True

    def parse(self, full_path: str):
        """Parse an epub file and return the base parser's info dict augmented
        with the (truncated) text of its chapters."""
        info = super().parse(full_path)

        book = epub.read_epub(full_path)
        info["content"] = ""

        for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            # Fix: decode with errors="replace" so one malformed chapter
            # doesn't abort the whole parse with UnicodeDecodeError.
            text = self.html2text.handle(chapter.content.decode("utf-8", "replace"))

            remaining = self.content_length - len(info["content"])
            if len(text) <= remaining:
                info["content"] += text
            else:
                info["content"] += text[:remaining]
                break

        return info

View File

@ -10,4 +10,6 @@ chardet
fonttools
brotli
unicodedata2
slate
pdfminer.six
ebooklib
html2text

13
spec/EbookParserTest.py Normal file
View File

@ -0,0 +1,13 @@
from unittest import TestCase
from parsing import EbookParser
class EbookParserTest(TestCase):
    """Checks epub extraction against the bundled fixture file."""

    def test_parse_content(self):
        # Content must be extracted and truncated to exactly the limit.
        max_len = 1000
        ebook_parser = EbookParser([], max_len)

        result = ebook_parser.parse("test_files/epub1.epub")

        self.assertEqual(len(result["content"]), max_len)

View File

@ -10,7 +10,7 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/truetype1.ttf")
self.assertEqual(info["font_name"], "Liberation Mono Bold")
self.assertEqual(info["content"], "Liberation Mono Bold")
def test_parse_name_openType(self):
@ -18,7 +18,7 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/opentype1.otf")
self.assertEqual(info["font_name"], "Linux Biolinum Keyboard O")
self.assertEqual(info["content"], "Linux Biolinum Keyboard O")
def test_parse_name_woff(self):
@ -26,7 +26,7 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/woff.woff")
self.assertEqual(info["font_name"], "Heart of Gold")
self.assertEqual(info["content"], "Heart of Gold")
def test_parse_name_woff2(self):
@ -34,4 +34,4 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/woff2.woff2")
self.assertEqual(info["font_name"], "Heart of Gold")
self.assertEqual(info["content"], "Heart of Gold")

14
spec/PdfFileParser.py Normal file
View File

@ -0,0 +1,14 @@
from unittest import TestCase
from parsing import PdfFileParser
class PdfParserTest(TestCase):
    """Checks PDF extraction against the bundled fixture file."""

    def test_parse_content(self):
        # Content must start with the document title, then the page text,
        # truncated to exactly the configured limit.
        limit = 12488
        pdf_parser = PdfFileParser([], limit)

        result = pdf_parser.parse("test_files/pdf1.pdf")

        self.assertEqual(len(result["content"]), limit)
        self.assertTrue(result["content"].startswith("Rabies\n03/11/2011\nRabies"))

BIN
spec/test_files/epub1.epub Normal file

Binary file not shown.

BIN
spec/test_files/pdf1.pdf Normal file

Binary file not shown.

159
tmp.py
View File

@ -1,56 +1,127 @@
from elasticsearch import Elasticsearch
from indexer import Indexer
import json
from crawler import Crawler
from indexer import Indexer
from parsing import GenericFileParser, Sha256CheckSumCalculator, ExtensionMimeGuesser
#!/usr/bin/env python
es = Elasticsearch()
1
# reset
es.indices.delete(index="test")
es.indices.create(index="test")
es.indices.close(index="test")
"""
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
"""
import sys
import logging
import six
import pdfminer.settings
pdfminer.settings.STRICT = False
import pdfminer.high_level
import pdfminer.layout
from pdfminer.image import ImageWriter
# # config
es.indices.put_settings(body='{"analysis": {"analyzer": {"path_analyser": {'
'"tokenizer": "path_tokenizer"}}, "tokenizer": {"path_tokenizer": {'
'"type": "path_hierarchy"}}}}', index="test")
def extract_text(files=[], outfile='-',
_py2_no_more_posargs=None, # Bloody Python2 needs a shim
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
output_type='text', codec='utf-8', strip_control=False,
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
layoutmode='normal', output_dir=None, debug=False,
disable_caching=False, **other):
if _py2_no_more_posargs is not None:
raise ValueError("Too many positional arguments passed.")
if not files:
raise ValueError("Must provide files to work upon!")
es.indices.put_mapping(body='{"properties": {'
'"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},'
'"suggest-path": {"type": "completion", "analyzer": "keyword"},'
'"mime": {"type": "keyword"}'
'}}', index="test",doc_type="file" )
# If any LAParams group arguments were passed, create an LAParams object and
# populate with given args. Otherwise, set it to None.
if not no_laparams:
laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None)
if paramv is not None:
setattr(laparams, param, paramv)
else:
laparams = None
es.indices.open(index="test")
imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)
if output_type == "text" and outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag") ):
if outfile.endswith(override):
output_type = alttype
if outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
codec = 'utf-8'
else:
outfp = open(outfile, "wb")
# add docs
for fname in files:
with open(fname, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp, **locals())
return outfp
# crawler = Crawler([GenericFileParser([Sha256CheckSumCalculator()], ExtensionMimeGuesser())])
# crawler.crawl("spec/test_folder")
#
# indexer = Indexer("test")
#
# indexer.index(crawler.documents)
# main
def main(args=None):
import argparse
P = argparse.ArgumentParser(description=__doc__)
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
A = P.parse_args(args=args)
# search
# print(es.search("test", "file", '{"query": {"term": {"name": "spec/test_folder/sub2/"}}}'))
# print(es.search("test", "file", '{"query": {"match_all": {}}, "aggs": {"test": {"terms": {"field": "mime"}}}}'))
# suggest = es.search("test", "file", '{"suggest": {"path-suggest": {"prefix": "spec/test_folder/sub", "completion": {"field": "suggest-path"}}}}')
#
# print(suggest["suggest"]["path-suggest"])
#
# for hit in suggest["suggest"]["path-suggest"][0]["options"]:
# print(hit["text"])
if A.page_numbers:
A.page_numbers = set([x-1 for x in A.page_numbers])
if A.pagenos:
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
# indexer = Indexer("test")
imagewriter = None
if A.output_dir:
imagewriter = ImageWriter(A.output_dir)
# import time
# time.sleep(10)
if six.PY2 and sys.stdin.encoding:
A.password = A.password.decode(sys.stdin.encoding)
c = Crawler([])
c.countFiles("/")
if A.output_type == "text" and A.outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml" ),
(".tag", "tag" ) ):
if A.outfile.endswith(override):
A.output_type = alttype
if A.outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
# Why ignore outfp.encoding? :-/ stupid cathal?
A.codec = 'utf-8'
else:
outfp = open(A.outfile, "wb")
## Test Code
outfp = extract_text(**vars(A))
outfp.close()
return 0
if __name__ == '__main__': sys.exit(main())