diff --git a/config.py b/config.py
index 6d87da2..09e876e 100644
--- a/config.py
+++ b/config.py
@@ -3,6 +3,7 @@ default_options = {
     "ThumbnailSize": "275",
     "ThumbnailColor": "FF00FF",
     "TextFileContentLength": "8192",
+    "PdfFileContentLength": "8192",
     "MimeGuesser": "extension",  # extension, content
     "CheckSumCalculators": "",  # md5, sha1, sha256
     "FileParsers": "media, text, picture, font"  # media, text, picture
diff --git a/crawler.py b/crawler.py
index 7c9c7cd..d57369a 100644
--- a/crawler.py
+++ b/crawler.py
@@ -4,7 +4,8 @@ import json
 from multiprocessing import Process, Value
 from apscheduler.schedulers.background import BackgroundScheduler
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
-    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
+    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
+    PdfFileParser
 from indexer import Indexer
 from search import Search
 from thumbnail import ThumbnailGenerator
@@ -138,7 +139,8 @@ class TaskManager:
                         MediaFileParser(chksum_calcs),
                         TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
                         PictureFileParser(chksum_calcs),
-                        FontParser(chksum_calcs)],
+                        FontParser(chksum_calcs),
+                        PdfFileParser(chksum_calcs, int(directory.get_option("PdfFileContentLength")))],
                        mime_guesser, self.indexer, directory.id)
         c.crawl(directory.path, counter)
diff --git a/indexer.py b/indexer.py
index 364d2ed..1f541aa 100644
--- a/indexer.py
+++ b/indexer.py
@@ -87,7 +87,6 @@ class Indexer:
             "title": {"analyzer": "my_nGram", "type": "text"},
             "genre": {"analyzer": "my_nGram", "type": "text"},
             "album_artist": {"analyzer": "my_nGram", "type": "text"},
-            "font_name": {"analyzer": "my_nGram", "type": "text"},
         }}, doc_type="file", index=self.index_name)

         self.es.indices.open(index=self.index_name)
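With the dedicated "font_name" mapping removed, font names are indexed into the shared, nGram-analyzed "content" field alongside text and PDF content, so a single query covers every parser's output. A minimal query sketch, assuming an elasticsearch-py 6.x-style client; the index name "test" is illustrative only (indexer.py supplies the real one):

    from elasticsearch import Elasticsearch

    es = Elasticsearch()

    # One match query now finds fonts, text files, and PDFs alike,
    # since all of them write their searchable text to "content".
    res = es.search(index="test", doc_type="file", body={
        "query": {"match": {"content": "Liberation Mono"}}
    })

    for hit in res["hits"]["hits"]:
        print(hit["_id"], hit["_score"])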
diff --git a/parsing.py b/parsing.py
index 050f553..25085a3 100644
--- a/parsing.py
+++ b/parsing.py
@@ -7,6 +7,15 @@ import json
 import chardet
 import html
 import warnings
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.layout import LAParams, LTTextBox, LTTextLine
+from pdfminer.converter import PDFPageAggregator
+import html2text
+from ebooklib import epub
+import ebooklib

 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
@@ -242,9 +251,9 @@ class PictureFileParser(GenericFileParser):
 class TextFileParser(GenericFileParser):
     is_default = False

-    def __init__(self, checksum_calculators: list, content_lenght: int):
+    def __init__(self, checksum_calculators: list, content_length: int):
         super().__init__(checksum_calculators)
-        self.content_lenght = content_lenght
+        self.content_length = content_length

         self.mime_types = [
             "text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
@@ -293,7 +302,7 @@ class TextFileParser(GenericFileParser):
         info = super().parse(full_path)

         with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_lenght)
+            raw_content = text_file.read(self.content_length)

             chardet.detect(raw_content)
             encoding = chardet.detect(raw_content)["encoding"]
@@ -321,7 +330,6 @@ class FontParser(GenericFileParser):
     def parse(self, full_path: str):
         info = super().parse(full_path)
-        print(info)

         with open(full_path, "rb") as f:
@@ -336,7 +344,7 @@ class FontParser(GenericFileParser):
             try:
                 for name in font["name"].names:
                     if name.nameID == 4:
-                        info["font_name"] = name.toUnicode("replace")
+                        info["content"] = name.toUnicode("replace")
                         break
             except AssertionError:
                 print("Could not read font name for " + full_path)
@@ -344,3 +352,100 @@
                 print("Could not read font for " + full_path)

         return info
+
+
+class PdfFileParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/pdf", "application/x-pdf"
+        ]
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        with open(full_path, "rb") as f:
+
+            info["content"] = ""
+
+            parser = PDFParser(f)
+            document = PDFDocument(parser)
+
+            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
+                info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+
+            try:
+                if document.is_extractable:
+                    resource_manager = PDFResourceManager()
+                    la_params = LAParams()
+
+                    device = PDFPageAggregator(resource_manager, laparams=la_params)
+                    interpreter = PDFPageInterpreter(resource_manager, device)
+
+                    for page in PDFPage.create_pages(document):
+
+                        interpreter.process_page(page)
+                        layout = device.get_result()
+
+                        for lt_obj in layout:
+                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+
+                                text = lt_obj.get_text()
+
+                                if len(info["content"]) + len(text) <= self.content_length:
+                                    info["content"] += text
+                                else:
+                                    info["content"] += text[0:self.content_length - len(info["content"])]
+                                    break
+                        else:
+                            continue
+                        break
+                else:
+                    print("PDF is not extractable: " + full_path)
+            except ValueError:
+                print("Couldn't parse page for " + full_path)
+
+        return info
+
+
+class EbookParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/epub+zip"
+        ]
+
+        self.html2text = html2text.HTML2Text()
+        self.html2text.ignore_images = True
+        self.html2text.ignore_emphasis = True
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        book = epub.read_epub(full_path)
+
+        info["content"] = ""
+
+        for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+
+            text = self.html2text.handle(text.content.decode("utf-8"))
+
+            if len(info["content"]) + len(text) <= self.content_length:
+                info["content"] += text
+            else:
+                info["content"] += text[0:self.content_length - len(info["content"])]
+                break
+
+        return info
diff --git a/requirements.txt b/requirements.txt
index 0799d8b..e1c101f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,6 @@ chardet
 fonttools
 brotli
 unicodedata2
-slate
\ No newline at end of file
+pdfminer.six
+ebooklib
+html2text
\ No newline at end of file
diff --git a/spec/EbookParserTest.py b/spec/EbookParserTest.py
new file mode 100644
index 0000000..9550bf3
--- /dev/null
+++ b/spec/EbookParserTest.py
@@ -0,0 +1,13 @@
+from unittest import TestCase
+from parsing import EbookParser
+
+
+class EbookParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = EbookParser([], 1000)
+
+        info = parser.parse("test_files/epub1.epub")
+
+        self.assertEqual(len(info["content"]), 1000)
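For reference, the new parsers can also be exercised directly, outside the crawler; note that EbookParser is not yet registered in crawler.py's parser list, so for now it only runs when called explicitly, as in the sketch below. The file paths are hypothetical, and the empty checksum-calculator lists just keep the example small:

    from parsing import PdfFileParser, EbookParser

    # 8192 mirrors the PdfFileContentLength default from config.py
    pdf_parser = PdfFileParser([], 8192)
    epub_parser = EbookParser([], 8192)

    pdf_info = pdf_parser.parse("docs/report.pdf")      # hypothetical path
    epub_info = epub_parser.parse("docs/novel.epub")    # hypothetical path

    print(pdf_info["content"][:200])
    print(epub_info["content"][:200])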
diff --git a/spec/FontParser_spec.py b/spec/FontParser_spec.py
index 3bc0ad7..19a576c 100644
--- a/spec/FontParser_spec.py
+++ b/spec/FontParser_spec.py
@@ -10,7 +10,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/truetype1.ttf")

-        self.assertEqual(info["font_name"], "Liberation Mono Bold")
+        self.assertEqual(info["content"], "Liberation Mono Bold")

     def test_parse_name_openType(self):
@@ -18,7 +18,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/opentype1.otf")

-        self.assertEqual(info["font_name"], "Linux Biolinum Keyboard O")
+        self.assertEqual(info["content"], "Linux Biolinum Keyboard O")

     def test_parse_name_woff(self):
@@ -26,7 +26,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff.woff")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")

     def test_parse_name_woff2(self):
@@ -34,4 +34,4 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff2.woff2")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")
diff --git a/spec/PdfFileParser.py b/spec/PdfFileParser.py
new file mode 100644
index 0000000..2e1d52b
--- /dev/null
+++ b/spec/PdfFileParser.py
@@ -0,0 +1,14 @@
+from unittest import TestCase
+from parsing import PdfFileParser
+
+
+class PdfParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = PdfFileParser([], 12488)
+
+        info = parser.parse("test_files/pdf1.pdf")
+
+        self.assertEqual(len(info["content"]), 12488)
+        self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
diff --git a/spec/test_files/epub1.epub b/spec/test_files/epub1.epub
new file mode 100644
index 0000000..e31d68c
Binary files /dev/null and b/spec/test_files/epub1.epub differ
diff --git a/spec/test_files/pdf1.pdf b/spec/test_files/pdf1.pdf
new file mode 100644
index 0000000..22fb22e
Binary files /dev/null and b/spec/test_files/pdf1.pdf differ
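A further spec in the same style could pin down the truncation behaviour explicitly. A sketch, where the 100-character limit is arbitrary and the test assumes the pdf1.pdf fixture holds at least that much extractable text (the 12488-character assertion above suggests it does):

    from unittest import TestCase
    from parsing import PdfFileParser


    class PdfParserTruncationTest(TestCase):

        def test_truncates_to_limit(self):
            parser = PdfFileParser([], 100)

            info = parser.parse("test_files/pdf1.pdf")

            # Extraction stops once content_length characters are collected
            self.assertEqual(len(info["content"]), 100)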
diff --git a/tmp.py b/tmp.py
index f744dd7..1e8ec0b 100644
--- a/tmp.py
+++ b/tmp.py
@@ -1,56 +1,127 @@
-from elasticsearch import Elasticsearch
-from indexer import Indexer
-import json
-from crawler import Crawler
-from indexer import Indexer
-from parsing import GenericFileParser, Sha256CheckSumCalculator, ExtensionMimeGuesser
+#!/usr/bin/env python

-es = Elasticsearch()
-1
-
-# reset
-es.indices.delete(index="test")
-es.indices.create(index="test")
-es.indices.close(index="test")
+"""
+Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
+"""
+import sys
+import logging
+import six
+import pdfminer.settings
+pdfminer.settings.STRICT = False
+import pdfminer.high_level
+import pdfminer.layout
+from pdfminer.image import ImageWriter

-# # config
-es.indices.put_settings(body='{"analysis": {"analyzer": {"path_analyser": {'
-                             '"tokenizer": "path_tokenizer"}}, "tokenizer": {"path_tokenizer": {'
-                             '"type": "path_hierarchy"}}}}', index="test")
+
+def extract_text(files=[], outfile='-',
+                 _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
+                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
+                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
+                 output_type='text', codec='utf-8', strip_control=False,
+                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+                 layoutmode='normal', output_dir=None, debug=False,
+                 disable_caching=False, **other):
+    if _py2_no_more_posargs is not None:
+        raise ValueError("Too many positional arguments passed.")
+    if not files:
+        raise ValueError("Must provide files to work upon!")

-es.indices.put_mapping(body='{"properties": {'
-                            '"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},'
-                            '"suggest-path": {"type": "completion", "analyzer": "keyword"},'
-                            '"mime": {"type": "keyword"}'
-                            '}}', index="test",doc_type="file" )
+    # If any LAParams group arguments were passed, create an LAParams object and
+    # populate with given args. Otherwise, set it to None.
+    if not no_laparams:
+        laparams = pdfminer.layout.LAParams()
+        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+            paramv = locals().get(param, None)
+            if paramv is not None:
+                setattr(laparams, param, paramv)
+    else:
+        laparams = None

-es.indices.open(index="test")
+    imagewriter = None
+    if output_dir:
+        imagewriter = ImageWriter(output_dir)
+
+    if output_type == "text" and outfile != "-":
+        for override, alttype in ((".htm", "html"),
+                                  (".html", "html"),
+                                  (".xml", "xml"),
+                                  (".tag", "tag")):
+            if outfile.endswith(override):
+                output_type = alttype
+
+    if outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            codec = 'utf-8'
+    else:
+        outfp = open(outfile, "wb")

-# add docs
+    for fname in files:
+        with open(fname, "rb") as fp:
+            pdfminer.high_level.extract_text_to_fp(fp, **locals())
+    return outfp

-# crawler = Crawler([GenericFileParser([Sha256CheckSumCalculator()], ExtensionMimeGuesser())])
-# crawler.crawl("spec/test_folder")
-#
-# indexer = Indexer("test")
-#
-# indexer.index(crawler.documents)
+
+# main
+def main(args=None):
+    import argparse
+    P = argparse.ArgumentParser(description=__doc__)
+    P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
+    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
+    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
+    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supersedes --pagenos where it is used.")
+    P.add_argument("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse")
+    P.add_argument("-P", "--password", type=str, default="", help="Decryption password for PDF")
+    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
+    P.add_argument("-t", "--output_type", type=str, default="text", help="Output type: text|html|xml|tag (default is text)")
+    P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
+    P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
+    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
+    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
+    P.add_argument("-W", "--word-margin", type=float, default=None, help="LAParams word margin")
+    P.add_argument("-M", "--char-margin", type=float, default=None, help="LAParams char margin")
+    P.add_argument("-L", "--line-margin", type=float, default=None, help="LAParams line margin")
+    P.add_argument("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow")
+    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
+    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help="Pass None as LAParams")
+    P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
+    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
+    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
+    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
+    A = P.parse_args(args=args)

-# search
-# print(es.search("test", "file", '{"query": {"term": {"name": "spec/test_folder/sub2/"}}}'))
-# print(es.search("test", "file", '{"query": {"match_all": {}}, "aggs": {"test": {"terms": {"field": "mime"}}}}'))
-# suggest = es.search("test", "file", '{"suggest": {"path-suggest": {"prefix": "spec/test_folder/sub", "completion": {"field": "suggest-path"}}}}')
-#
-# print(suggest["suggest"]["path-suggest"])
-#
-# for hit in suggest["suggest"]["path-suggest"][0]["options"]:
-#     print(hit["text"])
+    if A.page_numbers:
+        A.page_numbers = set([x - 1 for x in A.page_numbers])
+    if A.pagenos:
+        A.page_numbers = set([int(x) - 1 for x in A.pagenos.split(",")])

-# indexer = Indexer("test")
+    imagewriter = None
+    if A.output_dir:
+        imagewriter = ImageWriter(A.output_dir)

-# import time
-# time.sleep(10)
+    if six.PY2 and sys.stdin.encoding:
+        A.password = A.password.decode(sys.stdin.encoding)

-c = Crawler([])
-c.countFiles("/")
+    if A.output_type == "text" and A.outfile != "-":
+        for override, alttype in ((".htm", "html"),
+                                  (".html", "html"),
+                                  (".xml", "xml"),
+                                  (".tag", "tag")):
+            if A.outfile.endswith(override):
+                A.output_type = alttype
+
+    if A.outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            # stdout already has an encoding; force utf-8 for the converter output
+            A.codec = 'utf-8'
+    else:
+        outfp = open(A.outfile, "wb")
+
+    outfp = extract_text(**vars(A))
+    outfp.close()
+    return 0
+
+
+if __name__ == '__main__': sys.exit(main())
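The replacement tmp.py appears to be adapted from pdfminer.six's pdf2txt command-line tool. A quick smoke test, importing it as a module; the PDF path is illustrative:

    import tmp

    # Dump the first two pages of a PDF to stdout as plain text
    tmp.main(["-m", "2", "some.pdf"])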