Mirror of https://github.com/simon987/Simple-Incremental-Search-Tool.git
Synced 2025-04-19 18:16:45 +00:00

Added pdf & epub parsing

This commit is contained in:
  parent 6d3cceb1b1
  commit 17c682a5ef
@@ -3,6 +3,7 @@ default_options = {
     "ThumbnailSize": "275",
     "ThumbnailColor": "FF00FF",
     "TextFileContentLength": "8192",
+    "PdfFileContentLength": "8192",
     "MimeGuesser": "extension",  # extension, content
     "CheckSumCalculators": "",  # md5, sha1, sha256
     "FileParsers": "media, text, picture, font"  # media, text, picture
@@ -4,7 +4,8 @@ import json
 from multiprocessing import Process, Value
 from apscheduler.schedulers.background import BackgroundScheduler
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
-    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
+    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
+    PdfFileParser
 from indexer import Indexer
 from search import Search
 from thumbnail import ThumbnailGenerator
@@ -138,7 +139,8 @@ class TaskManager:
             MediaFileParser(chksum_calcs),
             TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
             PictureFileParser(chksum_calcs),
-            FontParser(chksum_calcs)],
+            FontParser(chksum_calcs),
+            PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
             mime_guesser, self.indexer, directory.id)
         c.crawl(directory.path, counter)
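The TODO above is worth spelling out: PdfFileParser is constructed with TextFileContentLength even though this commit adds a dedicated PdfFileContentLength option. A hypothetical follow-up sketch (not part of this commit) showing how the parser list could read the new option, and where the new EbookParser (added in parsing.py below but not registered here) would slot in:

    # Hypothetical follow-up, not in this commit: use the new PdfFileContentLength
    # option and register EbookParser alongside PdfFileParser.
    from parsing import (MediaFileParser, TextFileParser, PictureFileParser,
                         FontParser, PdfFileParser, EbookParser)

    def build_parsers(directory, chksum_calcs):
        pdf_len = int(directory.get_option("PdfFileContentLength"))

        return [MediaFileParser(chksum_calcs),
                TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
                PictureFileParser(chksum_calcs),
                FontParser(chksum_calcs),
                PdfFileParser(chksum_calcs, pdf_len),
                EbookParser(chksum_calcs, pdf_len)]  # EbookParser is not yet wired into TaskManager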
@@ -87,7 +87,6 @@ class Indexer:
             "title": {"analyzer": "my_nGram", "type": "text"},
             "genre": {"analyzer": "my_nGram", "type": "text"},
             "album_artist": {"analyzer": "my_nGram", "type": "text"},
-            "font_name": {"analyzer": "my_nGram", "type": "text"},
         }}, doc_type="file", index=self.index_name)

         self.es.indices.open(index=self.index_name)
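With the dedicated font_name field dropped from the mapping, font names are now indexed through the shared "content" field (see the FontParser and FontParserTest changes below). A rough sketch of what a search against that field looks like; the index and doc_type names here are illustrative, taken from the scratch code elsewhere in this repo rather than from this commit:

    # Rough sketch, assuming a local Elasticsearch and an index named "test".
    from elasticsearch import Elasticsearch

    es = Elasticsearch()
    res = es.search(index="test", doc_type="file", body={
        "query": {"match": {"content": "Liberation Mono"}}  # font names now live in "content"
    })
    print(res["hits"]["total"])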
parsing.py (115 changed lines)
@@ -7,6 +7,15 @@ import json
 import chardet
 import html
 import warnings
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.layout import LAParams, LTTextBox, LTTextLine
+from pdfminer.converter import PDFPageAggregator
+import html2text
+from ebooklib import epub
+import ebooklib
 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
@@ -242,9 +251,9 @@ class PictureFileParser(GenericFileParser):
 class TextFileParser(GenericFileParser):
     is_default = False

-    def __init__(self, checksum_calculators: list, content_lenght: int):
+    def __init__(self, checksum_calculators: list, content_length: int):
         super().__init__(checksum_calculators)
-        self.content_lenght = content_lenght
+        self.content_length = content_length

         self.mime_types = [
             "text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
@@ -293,7 +302,7 @@ class TextFileParser(GenericFileParser):
         info = super().parse(full_path)

         with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_lenght)
+            raw_content = text_file.read(self.content_length)

         chardet.detect(raw_content)
         encoding = chardet.detect(raw_content)["encoding"]
@@ -321,7 +330,6 @@ class FontParser(GenericFileParser):
     def parse(self, full_path: str):

         info = super().parse(full_path)
-        print(info)

         with open(full_path, "rb") as f:
@@ -336,7 +344,7 @@ class FontParser(GenericFileParser):
         try:
             for name in font["name"].names:
                 if name.nameID == 4:
-                    info["font_name"] = name.toUnicode("replace")
+                    info["content"] = name.toUnicode("replace")
                     break
         except AssertionError:
             print("Could not read font name for " + full_path)
@@ -344,3 +352,100 @@ class FontParser(GenericFileParser):
             print("Could not read font for " + full_path)

         return info
+
+
+class PdfFileParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/pdf", "application/x-pdf"
+        ]
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        with open(full_path, "rb") as f:
+
+            info["content"] = ""
+
+            parser = PDFParser(f)
+            document = PDFDocument(parser)
+
+            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
+                info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+
+            try:
+                if document.is_extractable:
+                    resource_manager = PDFResourceManager()
+                    la_params = LAParams()
+
+                    device = PDFPageAggregator(resource_manager, laparams=la_params)
+                    interpreter = PDFPageInterpreter(resource_manager, device)
+
+                    for page in PDFPage.create_pages(document):
+
+                        interpreter.process_page(page)
+                        layout = device.get_result()
+
+                        for lt_obj in layout:
+                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+
+                                text = lt_obj.get_text()
+
+                                if len(info["content"]) + len(text) <= self.content_length:
+                                    info["content"] += text
+                                else:
+                                    info["content"] += text[0:self.content_length - len(info["content"])]
+                                    break
+                        else:
+                            continue
+                        break
+                else:
+                    print("PDF is not extractable: " + full_path)
+            except ValueError:
+                print("Couldn't parse page for " + full_path)
+
+        return info
+
+
+class EbookParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/epub+zip"
+        ]
+
+        self.html2text = html2text.HTML2Text()
+        self.html2text.ignore_images = True
+        self.html2text.ignore_emphasis = True
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        book = epub.read_epub(full_path)
+
+        info["content"] = ""
+
+        for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+
+            text = self.html2text.handle(text.content.decode("utf-8"))
+
+            if len(info["content"]) + len(text) <= self.content_length:
+                info["content"] += text
+            else:
+                info["content"] += text[0:self.content_length - len(info["content"])]
+                break
+
+        return info
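The trickiest construct in PdfFileParser.parse above is the nested for/else: the inner break fires once the content cap is hit mid-page, the else: continue only runs when the inner loop finished without breaking, and the outer break then stops paging early. A standalone sketch of the same idiom with plain lists (names here are illustrative, not from the commit):

    # Standalone sketch of the for/else early-exit idiom used in PdfFileParser.parse:
    # stop consuming "pages" as soon as the accumulated text reaches the cap.
    pages = [["aaa", "bbb"], ["ccc", "ddd"], ["eee"]]
    content, cap = "", 7

    for page in pages:
        for text in page:
            if len(content) + len(text) <= cap:
                content += text
            else:
                content += text[0:cap - len(content)]
                break       # inner break: cap reached mid-page
        else:
            continue        # runs only if the inner loop did NOT break
        break               # outer break: mirrors the inner one

    print(content)  # "aaabbbc" -> exactly 7 characters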
@@ -10,4 +10,6 @@ chardet
 fonttools
 brotli
 unicodedata2
-slate
+pdfminer.six
+ebooklib
+html2text
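A quick sanity-check sketch for the swapped dependencies (slate out, pdfminer.six/ebooklib/html2text in), assuming the requirements were installed; note pdfminer.six's extract_text_to_fp writes encoded bytes by default, hence the BytesIO:

    # Minimal check that the new dependencies import and can extract PDF text.
    import io

    import ebooklib          # noqa: F401  (epub reading)
    import html2text         # noqa: F401  (epub HTML -> text)
    import pdfminer.high_level

    buf = io.BytesIO()
    with open("spec/test_files/pdf1.pdf", "rb") as fp:  # test file added in this commit
        pdfminer.high_level.extract_text_to_fp(fp, buf)
    print(buf.getvalue().decode("utf-8")[:80])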
spec/EbookParserTest.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+from unittest import TestCase
+from parsing import EbookParser
+
+
+class EbookParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = EbookParser([], 1000)
+
+        info = parser.parse("test_files/epub1.epub")
+
+        self.assertEqual(len(info["content"]), 1000)
spec/FontParserTest.py
@@ -10,7 +10,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/truetype1.ttf")

-        self.assertEqual(info["font_name"], "Liberation Mono Bold")
+        self.assertEqual(info["content"], "Liberation Mono Bold")

     def test_parse_name_openType(self):

@@ -18,7 +18,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/opentype1.otf")

-        self.assertEqual(info["font_name"], "Linux Biolinum Keyboard O")
+        self.assertEqual(info["content"], "Linux Biolinum Keyboard O")

     def test_parse_name_woff(self):

@@ -26,7 +26,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff.woff")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")

     def test_parse_name_woff2(self):

@@ -34,4 +34,4 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff2.woff2")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")
spec/PdfFileParser.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+from unittest import TestCase
+from parsing import PdfFileParser
+
+
+class PdfParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = PdfFileParser([], 12488)
+
+        info = parser.parse("test_files/pdf1.pdf")
+
+        self.assertEqual(len(info["content"]), 12488)
+        self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
spec/test_files/epub1.epub (new binary file, not shown)
spec/test_files/pdf1.pdf (new binary file, not shown)
tmp.py (159 changed lines)
@@ -1,56 +1,127 @@
-from elasticsearch import Elasticsearch
-from indexer import Indexer
-import json
-from crawler import Crawler
-from indexer import Indexer
-from parsing import GenericFileParser, Sha256CheckSumCalculator, ExtensionMimeGuesser
-
-es = Elasticsearch()
-
-# reset
-es.indices.delete(index="test")
-es.indices.create(index="test")
-es.indices.close(index="test")
-
-# # config
-es.indices.put_settings(body='{"analysis": {"analyzer": {"path_analyser": {'
-                             '"tokenizer": "path_tokenizer"}}, "tokenizer": {"path_tokenizer": {'
-                             '"type": "path_hierarchy"}}}}', index="test")
-
-es.indices.put_mapping(body='{"properties": {'
-                            '"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},'
-                            '"suggest-path": {"type": "completion", "analyzer": "keyword"},'
-                            '"mime": {"type": "keyword"}'
-                            '}}', index="test", doc_type="file")
-
-es.indices.open(index="test")
-
-# add docs
-# crawler = Crawler([GenericFileParser([Sha256CheckSumCalculator()], ExtensionMimeGuesser())])
-# crawler.crawl("spec/test_folder")
-#
-# indexer = Indexer("test")
-#
-# indexer.index(crawler.documents)
-
-# search
-# print(es.search("test", "file", '{"query": {"term": {"name": "spec/test_folder/sub2/"}}}'))
-# print(es.search("test", "file", '{"query": {"match_all": {}}, "aggs": {"test": {"terms": {"field": "mime"}}}}'))
-# suggest = es.search("test", "file", '{"suggest": {"path-suggest": {"prefix": "spec/test_folder/sub", "completion": {"field": "suggest-path"}}}}')
-#
-# print(suggest["suggest"]["path-suggest"])
-#
-# for hit in suggest["suggest"]["path-suggest"][0]["options"]:
-#     print(hit["text"])
-
-# indexer = Indexer("test")
-
-# import time
-# time.sleep(10)
-
-c = Crawler([])
-c.countFiles("/")
+#!/usr/bin/env python
+"""
+Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
+"""
+import sys
+import logging
+import six
+import pdfminer.settings
+pdfminer.settings.STRICT = False
+import pdfminer.high_level
+import pdfminer.layout
+from pdfminer.image import ImageWriter
+
+
+def extract_text(files=[], outfile='-',
+                 _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
+                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
+                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
+                 output_type='text', codec='utf-8', strip_control=False,
+                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+                 layoutmode='normal', output_dir=None, debug=False,
+                 disable_caching=False, **other):
+    if _py2_no_more_posargs is not None:
+        raise ValueError("Too many positional arguments passed.")
+    if not files:
+        raise ValueError("Must provide files to work upon!")
+
+    # If any LAParams group arguments were passed, create an LAParams object and
+    # populate with given args. Otherwise, set it to None.
+    if not no_laparams:
+        laparams = pdfminer.layout.LAParams()
+        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+            paramv = locals().get(param, None)
+            if paramv is not None:
+                setattr(laparams, param, paramv)
+    else:
+        laparams = None
+
+    imagewriter = None
+    if output_dir:
+        imagewriter = ImageWriter(output_dir)
+
+    if output_type == "text" and outfile != "-":
+        for override, alttype in ( (".htm", "html"),
+                                   (".html", "html"),
+                                   (".xml", "xml"),
+                                   (".tag", "tag") ):
+            if outfile.endswith(override):
+                output_type = alttype
+
+    if outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            codec = 'utf-8'
+    else:
+        outfp = open(outfile, "wb")
+
+    for fname in files:
+        with open(fname, "rb") as fp:
+            pdfminer.high_level.extract_text_to_fp(fp, **locals())
+    return outfp
+
+# main
+def main(args=None):
+    import argparse
+    P = argparse.ArgumentParser(description=__doc__)
+    P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
+    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
+    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
+    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
+    P.add_argument("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse")
+    P.add_argument("-P", "--password", type=str, default="", help="Decryption password for PDF")
+    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
+    P.add_argument("-t", "--output_type", type=str, default="text", help="Output type: text|html|xml|tag (default is text)")
+    P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
+    P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
+    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
+    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
+    P.add_argument("-W", "--word-margin", type=float, default=None, help="LAParams word margin")
+    P.add_argument("-M", "--char-margin", type=float, default=None, help="LAParams char margin")
+    P.add_argument("-L", "--line-margin", type=float, default=None, help="LAParams line margin")
+    P.add_argument("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow")
+    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
+    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help="Pass None as LAParams")
+    P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
+    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
+    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
+    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
+    A = P.parse_args(args=args)
+
+    if A.page_numbers:
+        A.page_numbers = set([x-1 for x in A.page_numbers])
+    if A.pagenos:
+        A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
+
+    imagewriter = None
+    if A.output_dir:
+        imagewriter = ImageWriter(A.output_dir)
+
+    if six.PY2 and sys.stdin.encoding:
+        A.password = A.password.decode(sys.stdin.encoding)
+
+    if A.output_type == "text" and A.outfile != "-":
+        for override, alttype in ( (".htm", "html"),
+                                   (".html", "html"),
+                                   (".xml", "xml" ),
+                                   (".tag", "tag" ) ):
+            if A.outfile.endswith(override):
+                A.output_type = alttype
+
+    if A.outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            # Why ignore outfp.encoding? :-/ stupid cathal?
+            A.codec = 'utf-8'
+    else:
+        outfp = open(A.outfile, "wb")
+
+    ## Test Code
+    outfp = extract_text(**vars(A))
+    outfp.close()
+    return 0


+if __name__ == '__main__': sys.exit(main())
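The rewritten tmp.py is essentially pdfminer.six's pdf2txt.py command-line tool, kept as a scratch reference; it replaces the old Elasticsearch experiments. A usage sketch, driving its main() programmatically (assumes the file is importable as tmp; the input is the test PDF added in this commit):

    # Equivalent to: python tmp.py -o out.txt -t text spec/test_files/pdf1.pdf
    import tmp

    rc = tmp.main(["-o", "out.txt", "-t", "text", "spec/test_files/pdf1.pdf"])
    print("exit code:", rc)  # 0 on success; extracted text lands in out.txt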