Added pdf & epub parsing

This commit is contained in:
simon987 2018-04-16 19:42:40 -04:00
parent 6d3cceb1b1
commit 17c682a5ef
11 changed files with 264 additions and 57 deletions

View File

@ -3,6 +3,7 @@ default_options = {
"ThumbnailSize": "275",
"ThumbnailColor": "FF00FF",
"TextFileContentLength": "8192",
"PdfFileContentLength": "8192",
"MimeGuesser": "extension", # extension, content
"CheckSumCalculators": "", # md5, sha1, sha256
"FileParsers": "media, text, picture, font" # media, text, picture

View File

@ -4,7 +4,8 @@ import json
from multiprocessing import Process, Value
from apscheduler.schedulers.background import BackgroundScheduler
from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
PdfFileParser
from indexer import Indexer
from search import Search
from thumbnail import ThumbnailGenerator
@ -138,7 +139,8 @@ class TaskManager:
MediaFileParser(chksum_calcs),
TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
PictureFileParser(chksum_calcs),
FontParser(chksum_calcs)],
FontParser(chksum_calcs),
PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))], # todo get content len from other opt
mime_guesser, self.indexer, directory.id)
c.crawl(directory.path, counter)

View File

@ -87,7 +87,6 @@ class Indexer:
"title": {"analyzer": "my_nGram", "type": "text"},
"genre": {"analyzer": "my_nGram", "type": "text"},
"album_artist": {"analyzer": "my_nGram", "type": "text"},
"font_name": {"analyzer": "my_nGram", "type": "text"},
}}, doc_type="file", index=self.index_name)
self.es.indices.open(index=self.index_name)

View File

@ -7,6 +7,15 @@ import json
import chardet
import html
import warnings
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator
import html2text
from ebooklib import epub
import ebooklib
from PIL import Image
from fontTools.ttLib import TTFont, TTLibError
@ -242,9 +251,9 @@ class PictureFileParser(GenericFileParser):
class TextFileParser(GenericFileParser):
is_default = False
def __init__(self, checksum_calculators: list, content_lenght: int):
def __init__(self, checksum_calculators: list, content_length: int):
super().__init__(checksum_calculators)
self.content_lenght = content_lenght
self.content_length = content_length
self.mime_types = [
"text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
@ -293,7 +302,7 @@ class TextFileParser(GenericFileParser):
info = super().parse(full_path)
with open(full_path, "rb") as text_file:
raw_content = text_file.read(self.content_lenght)
raw_content = text_file.read(self.content_length)
chardet.detect(raw_content)
encoding = chardet.detect(raw_content)["encoding"]
@ -321,7 +330,6 @@ class FontParser(GenericFileParser):
def parse(self, full_path: str):
info = super().parse(full_path)
print(info)
with open(full_path, "rb") as f:
@ -336,7 +344,7 @@ class FontParser(GenericFileParser):
try:
for name in font["name"].names:
if name.nameID == 4:
info["font_name"] = name.toUnicode("replace")
info["content"] = name.toUnicode("replace")
break
except AssertionError:
print("Could not read font name for " + full_path)
@ -344,3 +352,100 @@ class FontParser(GenericFileParser):
print("Could not read font for " + full_path)
return info
class PdfFileParser(GenericFileParser):
    """Extracts the title and text content of PDF documents using pdfminer.

    The document title (when present) and the page text are concatenated
    into info["content"], truncated to at most content_length characters.
    """

    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """
        :param checksum_calculators: checksum calculators applied by the base parser
        :param content_length: maximum number of characters stored in info["content"]
        """
        super().__init__(checksum_calculators)
        self.content_length = content_length

        self.mime_types = [
            "application/pdf", "application/x-pdf"
        ]

    def parse(self, full_path: str):
        """Parse a PDF file and return the base parser's info dict augmented
        with the (truncated) extracted text in info["content"]."""
        info = super().parse(full_path)
        info["content"] = ""

        with open(full_path, "rb") as f:
            parser = PDFParser(f)
            document = PDFDocument(parser)

            # Prepend the document title when present. The Title entry is
            # usually raw bytes, but pdfminer can also hand back other object
            # types (e.g. PSLiteral); only decode when it really is bytes so
            # we don't crash with AttributeError on exotic documents.
            if len(document.info) > 0 and "Title" in document.info[0]:
                title = document.info[0]["Title"]
                if isinstance(title, bytes) and title != b"":
                    info["content"] += title.decode("utf-8", "replace") + "\n"

            try:
                if document.is_extractable:
                    resource_manager = PDFResourceManager()
                    la_params = LAParams()
                    device = PDFPageAggregator(resource_manager, laparams=la_params)
                    interpreter = PDFPageInterpreter(resource_manager, device)

                    for page in PDFPage.create_pages(document):
                        interpreter.process_page(page)
                        layout = device.get_result()

                        for lt_obj in layout:
                            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                                text = lt_obj.get_text()

                                remaining = self.content_length - len(info["content"])
                                if len(text) <= remaining:
                                    info["content"] += text
                                else:
                                    # Length limit reached: take what fits and
                                    # stop reading further pages.
                                    info["content"] += text[:remaining]
                                    break
                        else:
                            continue  # inner loop finished normally; next page
                        break  # inner loop hit the limit; stop paging
                else:
                    print("PDF is not extractable: " + full_path)
            except ValueError:
                print("Couldn't parse page for " + full_path)

        return info
class EbookParser(GenericFileParser):
    """Extracts the text content of epub ebooks into info["content"],
    truncated to at most content_length characters."""

    is_default = False

    def __init__(self, checksum_calculators: list, content_length: int):
        """
        :param checksum_calculators: checksum calculators applied by the base parser
        :param content_length: maximum number of characters stored in info["content"]
        """
        super().__init__(checksum_calculators)
        self.content_length = content_length

        self.mime_types = [
            "application/epub+zip"
        ]

        # Images and emphasis markup are noise for search indexing; strip
        # them during the html -> text conversion.
        self.html2text = html2text.HTML2Text()
        self.html2text.ignore_images = True
        self.html2text.ignore_emphasis = True

    def parse(self, full_path: str):
        """Parse an epub file and return the base parser's info dict augmented
        with the (truncated) text of its chapters."""
        info = super().parse(full_path)

        book = epub.read_epub(full_path)
        info["content"] = ""

        for chapter in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            # Fix: decode with errors="replace" so one malformed chapter
            # doesn't abort the whole parse with UnicodeDecodeError.
            text = self.html2text.handle(chapter.content.decode("utf-8", "replace"))

            remaining = self.content_length - len(info["content"])
            if len(text) <= remaining:
                info["content"] += text
            else:
                info["content"] += text[:remaining]
                break

        return info

View File

@ -10,4 +10,6 @@ chardet
fonttools
brotli
unicodedata2
slate
pdfminer.six
ebooklib
html2text

13
spec/EbookParserTest.py Normal file
View File

@ -0,0 +1,13 @@
from unittest import TestCase
from parsing import EbookParser
class EbookParserTest(TestCase):
    """Checks epub extraction against the bundled fixture file."""

    def test_parse_content(self):
        # Content must be extracted and truncated to exactly the limit.
        max_len = 1000
        ebook_parser = EbookParser([], max_len)

        result = ebook_parser.parse("test_files/epub1.epub")

        self.assertEqual(len(result["content"]), max_len)

View File

@ -10,7 +10,7 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/truetype1.ttf")
self.assertEqual(info["font_name"], "Liberation Mono Bold")
self.assertEqual(info["content"], "Liberation Mono Bold")
def test_parse_name_openType(self):
@ -18,7 +18,7 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/opentype1.otf")
self.assertEqual(info["font_name"], "Linux Biolinum Keyboard O")
self.assertEqual(info["content"], "Linux Biolinum Keyboard O")
def test_parse_name_woff(self):
@ -26,7 +26,7 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/woff.woff")
self.assertEqual(info["font_name"], "Heart of Gold")
self.assertEqual(info["content"], "Heart of Gold")
def test_parse_name_woff2(self):
@ -34,4 +34,4 @@ class FontParserTest(TestCase):
info = parser.parse("test_files/woff2.woff2")
self.assertEqual(info["font_name"], "Heart of Gold")
self.assertEqual(info["content"], "Heart of Gold")

14
spec/PdfFileParser.py Normal file
View File

@ -0,0 +1,14 @@
from unittest import TestCase
from parsing import PdfFileParser
class PdfParserTest(TestCase):
    """Checks PDF extraction against the bundled fixture file."""

    def test_parse_content(self):
        # Content must start with the document title, then the page text,
        # truncated to exactly the configured limit.
        limit = 12488
        pdf_parser = PdfFileParser([], limit)

        result = pdf_parser.parse("test_files/pdf1.pdf")

        self.assertEqual(len(result["content"]), limit)
        self.assertTrue(result["content"].startswith("Rabies\n03/11/2011\nRabies"))

BIN
spec/test_files/epub1.epub Normal file

Binary file not shown.

BIN
spec/test_files/pdf1.pdf Normal file

Binary file not shown.

159
tmp.py
View File

@ -1,56 +1,127 @@
from elasticsearch import Elasticsearch
from indexer import Indexer
import json
from crawler import Crawler
from indexer import Indexer
from parsing import GenericFileParser, Sha256CheckSumCalculator, ExtensionMimeGuesser
#!/usr/bin/env python
es = Elasticsearch()
1
# reset
es.indices.delete(index="test")
es.indices.create(index="test")
es.indices.close(index="test")
"""
Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
"""
import sys
import logging
import six
import pdfminer.settings
pdfminer.settings.STRICT = False
import pdfminer.high_level
import pdfminer.layout
from pdfminer.image import ImageWriter
# # config
es.indices.put_settings(body='{"analysis": {"analyzer": {"path_analyser": {'
'"tokenizer": "path_tokenizer"}}, "tokenizer": {"path_tokenizer": {'
'"type": "path_hierarchy"}}}}', index="test")
def extract_text(files=[], outfile='-',
_py2_no_more_posargs=None, # Bloody Python2 needs a shim
no_laparams=False, all_texts=None, detect_vertical=None, # LAParams
word_margin=None, char_margin=None, line_margin=None, boxes_flow=None, # LAParams
output_type='text', codec='utf-8', strip_control=False,
maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
layoutmode='normal', output_dir=None, debug=False,
disable_caching=False, **other):
if _py2_no_more_posargs is not None:
raise ValueError("Too many positional arguments passed.")
if not files:
raise ValueError("Must provide files to work upon!")
es.indices.put_mapping(body='{"properties": {'
'"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},'
'"suggest-path": {"type": "completion", "analyzer": "keyword"},'
'"mime": {"type": "keyword"}'
'}}', index="test",doc_type="file" )
# If any LAParams group arguments were passed, create an LAParams object and
# populate with given args. Otherwise, set it to None.
if not no_laparams:
laparams = pdfminer.layout.LAParams()
for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None)
if paramv is not None:
setattr(laparams, param, paramv)
else:
laparams = None
es.indices.open(index="test")
imagewriter = None
if output_dir:
imagewriter = ImageWriter(output_dir)
if output_type == "text" and outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml"),
(".tag", "tag") ):
if outfile.endswith(override):
output_type = alttype
if outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
codec = 'utf-8'
else:
outfp = open(outfile, "wb")
# add docs
for fname in files:
with open(fname, "rb") as fp:
pdfminer.high_level.extract_text_to_fp(fp, **locals())
return outfp
# crawler = Crawler([GenericFileParser([Sha256CheckSumCalculator()], ExtensionMimeGuesser())])
# crawler.crawl("spec/test_folder")
#
# indexer = Indexer("test")
#
# indexer.index(crawler.documents)
# main
def main(args=None):
import argparse
P = argparse.ArgumentParser(description=__doc__)
P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
P.add_argument("-m", "--maxpages", type=int, default=0, help = "Maximum pages to parse")
P.add_argument("-P", "--password", type=str, default="", help = "Decryption password for PDF")
P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
P.add_argument("-t", "--output_type", type=str, default="text", help = "Output type: text|html|xml|tag (default is text)")
P.add_argument("-c", "--codec", type=str, default="utf-8", help = "Text encoding")
P.add_argument("-s", "--scale", type=float, default=1.0, help = "Scale")
P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
P.add_argument("-W", "--word-margin", type=float, default=None, help = "LAParams word margin")
P.add_argument("-M", "--char-margin", type=float, default=None, help = "LAParams char margin")
P.add_argument("-L", "--line-margin", type=float, default=None, help = "LAParams line margin")
P.add_argument("-F", "--boxes-flow", type=float, default=None, help = "LAParams boxes flow")
P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
P.add_argument("-n", "--no-laparams", default=False, action="store_true", help = "Pass None as LAParams")
P.add_argument("-R", "--rotation", default=0, type=int, help = "Rotation")
P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
A = P.parse_args(args=args)
# search
# print(es.search("test", "file", '{"query": {"term": {"name": "spec/test_folder/sub2/"}}}'))
# print(es.search("test", "file", '{"query": {"match_all": {}}, "aggs": {"test": {"terms": {"field": "mime"}}}}'))
# suggest = es.search("test", "file", '{"suggest": {"path-suggest": {"prefix": "spec/test_folder/sub", "completion": {"field": "suggest-path"}}}}')
#
# print(suggest["suggest"]["path-suggest"])
#
# for hit in suggest["suggest"]["path-suggest"][0]["options"]:
# print(hit["text"])
if A.page_numbers:
A.page_numbers = set([x-1 for x in A.page_numbers])
if A.pagenos:
A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
# indexer = Indexer("test")
imagewriter = None
if A.output_dir:
imagewriter = ImageWriter(A.output_dir)
# import time
# time.sleep(10)
if six.PY2 and sys.stdin.encoding:
A.password = A.password.decode(sys.stdin.encoding)
c = Crawler([])
c.countFiles("/")
if A.output_type == "text" and A.outfile != "-":
for override, alttype in ( (".htm", "html"),
(".html", "html"),
(".xml", "xml" ),
(".tag", "tag" ) ):
if A.outfile.endswith(override):
A.output_type = alttype
if A.outfile == "-":
outfp = sys.stdout
if outfp.encoding is not None:
# Why ignore outfp.encoding? :-/ stupid cathal?
A.codec = 'utf-8'
else:
outfp = open(A.outfile, "wb")
## Test Code
outfp = extract_text(**vars(A))
outfp.close()
return 0
if __name__ == '__main__': sys.exit(main())