Mirror of https://github.com/simon987/Simple-Incremental-Search-Tool.git
Synced 2025-04-19 18:16:45 +00:00

Added pdf & epub parsing

This commit is contained in:
  parent 6d3cceb1b1
  commit 17c682a5ef
@@ -3,6 +3,7 @@ default_options = {
     "ThumbnailSize": "275",
     "ThumbnailColor": "FF00FF",
     "TextFileContentLength": "8192",
+    "PdfFileContentLength": "8192",
     "MimeGuesser": "extension",  # extension, content
     "CheckSumCalculators": "",  # md5, sha1, sha256
     "FileParsers": "media, text, picture, font"  # media, text, picture
@@ -4,7 +4,8 @@ import json
 from multiprocessing import Process, Value
 from apscheduler.schedulers.background import BackgroundScheduler
 from parsing import GenericFileParser, Md5CheckSumCalculator, ExtensionMimeGuesser, MediaFileParser, TextFileParser, \
-    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser
+    PictureFileParser, Sha1CheckSumCalculator, Sha256CheckSumCalculator, ContentMimeGuesser, MimeGuesser, FontParser, \
+    PdfFileParser
 from indexer import Indexer
 from search import Search
 from thumbnail import ThumbnailGenerator
@@ -138,7 +139,8 @@ class TaskManager:
             MediaFileParser(chksum_calcs),
             TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
             PictureFileParser(chksum_calcs),
-            FontParser(chksum_calcs)],
+            FontParser(chksum_calcs),
+            PdfFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength")))],  # todo get content len from other opt
             mime_guesser, self.indexer, directory.id)
         c.crawl(directory.path, counter)
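The TODO above is worth spelling out: PdfFileParser is constructed with TextFileContentLength even though this commit adds a dedicated PdfFileContentLength option. A hypothetical follow-up sketch (not part of this commit) showing how the parser list could read the new option, and where the new EbookParser (added in parsing.py below but not registered here) would slot in:

    # Hypothetical follow-up, not in this commit: use the new PdfFileContentLength
    # option and register EbookParser alongside PdfFileParser.
    from parsing import (MediaFileParser, TextFileParser, PictureFileParser,
                         FontParser, PdfFileParser, EbookParser)

    def build_parsers(directory, chksum_calcs):
        pdf_len = int(directory.get_option("PdfFileContentLength"))

        return [MediaFileParser(chksum_calcs),
                TextFileParser(chksum_calcs, int(directory.get_option("TextFileContentLength"))),
                PictureFileParser(chksum_calcs),
                FontParser(chksum_calcs),
                PdfFileParser(chksum_calcs, pdf_len),
                EbookParser(chksum_calcs, pdf_len)]  # EbookParser is not yet wired into TaskManager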
@@ -87,7 +87,6 @@ class Indexer:
             "title": {"analyzer": "my_nGram", "type": "text"},
             "genre": {"analyzer": "my_nGram", "type": "text"},
             "album_artist": {"analyzer": "my_nGram", "type": "text"},
-            "font_name": {"analyzer": "my_nGram", "type": "text"},
         }}, doc_type="file", index=self.index_name)

         self.es.indices.open(index=self.index_name)
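With the dedicated font_name field dropped from the mapping, font names are now indexed through the shared "content" field (see the FontParser and FontParserTest changes below). A rough sketch of what a search against that field looks like; the index and doc_type names here are illustrative, taken from the scratch code elsewhere in this repo rather than from this commit:

    # Rough sketch, assuming a local Elasticsearch and an index named "test".
    from elasticsearch import Elasticsearch

    es = Elasticsearch()
    res = es.search(index="test", doc_type="file", body={
        "query": {"match": {"content": "Liberation Mono"}}  # font names now live in "content"
    })
    print(res["hits"]["total"])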
parsing.py (115 changed lines)
@@ -7,6 +7,15 @@ import json
 import chardet
 import html
 import warnings
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.layout import LAParams, LTTextBox, LTTextLine
+from pdfminer.converter import PDFPageAggregator
+import html2text
+from ebooklib import epub
+import ebooklib
 from PIL import Image
 from fontTools.ttLib import TTFont, TTLibError
@@ -242,9 +251,9 @@ class PictureFileParser(GenericFileParser):
 class TextFileParser(GenericFileParser):
     is_default = False

-    def __init__(self, checksum_calculators: list, content_lenght: int):
+    def __init__(self, checksum_calculators: list, content_length: int):
         super().__init__(checksum_calculators)
-        self.content_lenght = content_lenght
+        self.content_length = content_length

         self.mime_types = [
             "text/asp", "text/css", "text/ecmascript", "text/html", "text/javascript",
@@ -293,7 +302,7 @@ class TextFileParser(GenericFileParser):
         info = super().parse(full_path)

         with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_lenght)
+            raw_content = text_file.read(self.content_length)

         chardet.detect(raw_content)
         encoding = chardet.detect(raw_content)["encoding"]
@@ -321,7 +330,6 @@ class FontParser(GenericFileParser):
     def parse(self, full_path: str):

         info = super().parse(full_path)
-        print(info)

         with open(full_path, "rb") as f:
@@ -336,7 +344,7 @@ class FontParser(GenericFileParser):
         try:
             for name in font["name"].names:
                 if name.nameID == 4:
-                    info["font_name"] = name.toUnicode("replace")
+                    info["content"] = name.toUnicode("replace")
                     break
         except AssertionError:
             print("Could not read font name for " + full_path)
@@ -344,3 +352,100 @@ class FontParser(GenericFileParser):
             print("Could not read font for " + full_path)

         return info
+
+
+class PdfFileParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/pdf", "application/x-pdf"
+        ]
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        with open(full_path, "rb") as f:
+
+            info["content"] = ""
+
+            parser = PDFParser(f)
+            document = PDFDocument(parser)
+
+            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
+                info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+
+            try:
+                if document.is_extractable:
+                    resource_manager = PDFResourceManager()
+                    la_params = LAParams()
+
+                    device = PDFPageAggregator(resource_manager, laparams=la_params)
+                    interpreter = PDFPageInterpreter(resource_manager, device)
+
+                    for page in PDFPage.create_pages(document):
+
+                        interpreter.process_page(page)
+                        layout = device.get_result()
+
+                        for lt_obj in layout:
+                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+
+                                text = lt_obj.get_text()
+
+                                if len(info["content"]) + len(text) <= self.content_length:
+                                    info["content"] += text
+                                else:
+                                    info["content"] += text[0:self.content_length - len(info["content"])]
+                                    break
+                        else:
+                            continue
+                        break
+                else:
+                    print("PDF is not extractable: " + full_path)
+            except ValueError:
+                print("Couldn't parse page for " + full_path)
+
+        return info
+
+
+class EbookParser(GenericFileParser):
+    is_default = False
+
+    def __init__(self, checksum_calculators: list, content_length: int):
+        super().__init__(checksum_calculators)
+
+        self.content_length = content_length
+
+        self.mime_types = [
+            "application/epub+zip"
+        ]
+
+        self.html2text = html2text.HTML2Text()
+        self.html2text.ignore_images = True
+        self.html2text.ignore_emphasis = True
+
+    def parse(self, full_path: str):
+        info = super().parse(full_path)
+
+        book = epub.read_epub(full_path)
+
+        info["content"] = ""
+
+        for text in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+
+            text = self.html2text.handle(text.content.decode("utf-8"))
+
+            if len(info["content"]) + len(text) <= self.content_length:
+                info["content"] += text
+            else:
+                info["content"] += text[0:self.content_length - len(info["content"])]
+                break
+
+        return info
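The trickiest construct in PdfFileParser.parse above is the nested for/else: the inner break fires once the content cap is hit mid-page, the else: continue only runs when the inner loop finished without breaking, and the outer break then stops paging early. A standalone sketch of the same idiom with plain lists (names here are illustrative, not from the commit):

    # Standalone sketch of the for/else early-exit idiom used in PdfFileParser.parse:
    # stop consuming "pages" as soon as the accumulated text reaches the cap.
    pages = [["aaa", "bbb"], ["ccc", "ddd"], ["eee"]]
    content, cap = "", 7

    for page in pages:
        for text in page:
            if len(content) + len(text) <= cap:
                content += text
            else:
                content += text[0:cap - len(content)]
                break       # inner break: cap reached mid-page
        else:
            continue        # runs only if the inner loop did NOT break
        break               # outer break: mirrors the inner one

    print(content)  # "aaabbbc" -> exactly 7 characters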
@@ -10,4 +10,6 @@ chardet
 fonttools
 brotli
 unicodedata2
-slate
+pdfminer.six
+ebooklib
+html2text
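A quick sanity-check sketch for the swapped dependencies (slate out, pdfminer.six/ebooklib/html2text in), assuming the requirements were installed; note pdfminer.six's extract_text_to_fp writes encoded bytes by default, hence the BytesIO:

    # Minimal check that the new dependencies import and can extract PDF text.
    import io

    import ebooklib          # noqa: F401  (epub reading)
    import html2text         # noqa: F401  (epub HTML -> text)
    import pdfminer.high_level

    buf = io.BytesIO()
    with open("spec/test_files/pdf1.pdf", "rb") as fp:  # test file added in this commit
        pdfminer.high_level.extract_text_to_fp(fp, buf)
    print(buf.getvalue().decode("utf-8")[:80])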
spec/EbookParserTest.py (new file, 13 lines)
@@ -0,0 +1,13 @@
+from unittest import TestCase
+from parsing import EbookParser
+
+
+class EbookParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = EbookParser([], 1000)
+
+        info = parser.parse("test_files/epub1.epub")
+
+        self.assertEqual(len(info["content"]), 1000)
spec/FontParserTest.py
@@ -10,7 +10,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/truetype1.ttf")

-        self.assertEqual(info["font_name"], "Liberation Mono Bold")
+        self.assertEqual(info["content"], "Liberation Mono Bold")

     def test_parse_name_openType(self):

@@ -18,7 +18,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/opentype1.otf")

-        self.assertEqual(info["font_name"], "Linux Biolinum Keyboard O")
+        self.assertEqual(info["content"], "Linux Biolinum Keyboard O")

     def test_parse_name_woff(self):

@@ -26,7 +26,7 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff.woff")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")

     def test_parse_name_woff2(self):

@@ -34,4 +34,4 @@ class FontParserTest(TestCase):

         info = parser.parse("test_files/woff2.woff2")

-        self.assertEqual(info["font_name"], "Heart of Gold")
+        self.assertEqual(info["content"], "Heart of Gold")
spec/PdfFileParser.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+from unittest import TestCase
+from parsing import PdfFileParser
+
+
+class PdfParserTest(TestCase):
+
+    def test_parse_content(self):
+
+        parser = PdfFileParser([], 12488)
+
+        info = parser.parse("test_files/pdf1.pdf")
+
+        self.assertEqual(len(info["content"]), 12488)
+        self.assertTrue(info["content"].startswith("Rabies\n03/11/2011\nRabies"))
spec/test_files/epub1.epub (new binary file, not shown)
spec/test_files/pdf1.pdf (new binary file, not shown)
tmp.py (159 changed lines)
@@ -1,56 +1,127 @@
-from elasticsearch import Elasticsearch
-from indexer import Indexer
-import json
-from crawler import Crawler
-from indexer import Indexer
-from parsing import GenericFileParser, Sha256CheckSumCalculator, ExtensionMimeGuesser
-
-es = Elasticsearch()
-
-# reset
-es.indices.delete(index="test")
-es.indices.create(index="test")
-es.indices.close(index="test")
-
-# # config
-es.indices.put_settings(body='{"analysis": {"analyzer": {"path_analyser": {'
-                             '"tokenizer": "path_tokenizer"}}, "tokenizer": {"path_tokenizer": {'
-                             '"type": "path_hierarchy"}}}}', index="test")
-
-es.indices.put_mapping(body='{"properties": {'
-                            '"name": {"type": "text", "analyzer": "path_analyser", "copy_to": "suggest-path"},'
-                            '"suggest-path": {"type": "completion", "analyzer": "keyword"},'
-                            '"mime": {"type": "keyword"}'
-                            '}}', index="test", doc_type="file")
-
-es.indices.open(index="test")
-
-# add docs
-# crawler = Crawler([GenericFileParser([Sha256CheckSumCalculator()], ExtensionMimeGuesser())])
-# crawler.crawl("spec/test_folder")
-#
-# indexer = Indexer("test")
-#
-# indexer.index(crawler.documents)
-
-# search
-# print(es.search("test", "file", '{"query": {"term": {"name": "spec/test_folder/sub2/"}}}'))
-# print(es.search("test", "file", '{"query": {"match_all": {}}, "aggs": {"test": {"terms": {"field": "mime"}}}}'))
-# suggest = es.search("test", "file", '{"suggest": {"path-suggest": {"prefix": "spec/test_folder/sub", "completion": {"field": "suggest-path"}}}}')
-#
-# print(suggest["suggest"]["path-suggest"])
-#
-# for hit in suggest["suggest"]["path-suggest"][0]["options"]:
-#     print(hit["text"])
-
-# indexer = Indexer("test")
-
-# import time
-# time.sleep(10)
-
-c = Crawler([])
-c.countFiles("/")
+#!/usr/bin/env python
+"""
+Converts PDF text content (though not images containing text) to plain text, html, xml or "tags".
+"""
+import sys
+import logging
+import six
+import pdfminer.settings
+pdfminer.settings.STRICT = False
+import pdfminer.high_level
+import pdfminer.layout
+from pdfminer.image import ImageWriter
+
+
+def extract_text(files=[], outfile='-',
+                 _py2_no_more_posargs=None,  # Bloody Python2 needs a shim
+                 no_laparams=False, all_texts=None, detect_vertical=None,  # LAParams
+                 word_margin=None, char_margin=None, line_margin=None, boxes_flow=None,  # LAParams
+                 output_type='text', codec='utf-8', strip_control=False,
+                 maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0,
+                 layoutmode='normal', output_dir=None, debug=False,
+                 disable_caching=False, **other):
+    if _py2_no_more_posargs is not None:
+        raise ValueError("Too many positional arguments passed.")
+    if not files:
+        raise ValueError("Must provide files to work upon!")
+
+    # If any LAParams group arguments were passed, create an LAParams object and
+    # populate with given args. Otherwise, set it to None.
+    if not no_laparams:
+        laparams = pdfminer.layout.LAParams()
+        for param in ("all_texts", "detect_vertical", "word_margin", "char_margin", "line_margin", "boxes_flow"):
+            paramv = locals().get(param, None)
+            if paramv is not None:
+                setattr(laparams, param, paramv)
+    else:
+        laparams = None
+
+    imagewriter = None
+    if output_dir:
+        imagewriter = ImageWriter(output_dir)
+
+    if output_type == "text" and outfile != "-":
+        for override, alttype in ( (".htm", "html"),
+                                   (".html", "html"),
+                                   (".xml", "xml"),
+                                   (".tag", "tag") ):
+            if outfile.endswith(override):
+                output_type = alttype
+
+    if outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            codec = 'utf-8'
+    else:
+        outfp = open(outfile, "wb")
+
+    for fname in files:
+        with open(fname, "rb") as fp:
+            pdfminer.high_level.extract_text_to_fp(fp, **locals())
+    return outfp
+
+# main
+def main(args=None):
+    import argparse
+    P = argparse.ArgumentParser(description=__doc__)
+    P.add_argument("files", type=str, default=None, nargs="+", help="Files to process.")
+    P.add_argument("-d", "--debug", default=False, action="store_true", help="Debug output.")
+    P.add_argument("-p", "--pagenos", type=str, help="Comma-separated list of page numbers to parse. Included for legacy applications, use -P/--page-numbers for more idiomatic argument entry.")
+    P.add_argument("--page-numbers", type=int, default=None, nargs="+", help="Alternative to --pagenos with space-separated numbers; supercedes --pagenos where it is used.")
+    P.add_argument("-m", "--maxpages", type=int, default=0, help="Maximum pages to parse")
+    P.add_argument("-P", "--password", type=str, default="", help="Decryption password for PDF")
+    P.add_argument("-o", "--outfile", type=str, default="-", help="Output file (default/'-' is stdout)")
+    P.add_argument("-t", "--output_type", type=str, default="text", help="Output type: text|html|xml|tag (default is text)")
+    P.add_argument("-c", "--codec", type=str, default="utf-8", help="Text encoding")
+    P.add_argument("-s", "--scale", type=float, default=1.0, help="Scale")
+    P.add_argument("-A", "--all-texts", default=None, action="store_true", help="LAParams all texts")
+    P.add_argument("-V", "--detect-vertical", default=None, action="store_true", help="LAParams detect vertical")
+    P.add_argument("-W", "--word-margin", type=float, default=None, help="LAParams word margin")
+    P.add_argument("-M", "--char-margin", type=float, default=None, help="LAParams char margin")
+    P.add_argument("-L", "--line-margin", type=float, default=None, help="LAParams line margin")
+    P.add_argument("-F", "--boxes-flow", type=float, default=None, help="LAParams boxes flow")
+    P.add_argument("-Y", "--layoutmode", default="normal", type=str, help="HTML Layout Mode")
+    P.add_argument("-n", "--no-laparams", default=False, action="store_true", help="Pass None as LAParams")
+    P.add_argument("-R", "--rotation", default=0, type=int, help="Rotation")
+    P.add_argument("-O", "--output-dir", default=None, help="Output directory for images")
+    P.add_argument("-C", "--disable-caching", default=False, action="store_true", help="Disable caching")
+    P.add_argument("-S", "--strip-control", default=False, action="store_true", help="Strip control in XML mode")
+    A = P.parse_args(args=args)
+
+    if A.page_numbers:
+        A.page_numbers = set([x-1 for x in A.page_numbers])
+    if A.pagenos:
+        A.page_numbers = set([int(x)-1 for x in A.pagenos.split(",")])
+
+    imagewriter = None
+    if A.output_dir:
+        imagewriter = ImageWriter(A.output_dir)
+
+    if six.PY2 and sys.stdin.encoding:
+        A.password = A.password.decode(sys.stdin.encoding)
+
+    if A.output_type == "text" and A.outfile != "-":
+        for override, alttype in ( (".htm", "html"),
+                                   (".html", "html"),
+                                   (".xml", "xml" ),
+                                   (".tag", "tag" ) ):
+            if A.outfile.endswith(override):
+                A.output_type = alttype
+
+    if A.outfile == "-":
+        outfp = sys.stdout
+        if outfp.encoding is not None:
+            # Why ignore outfp.encoding? :-/ stupid cathal?
+            A.codec = 'utf-8'
+    else:
+        outfp = open(A.outfile, "wb")
+
+    ## Test Code
+    outfp = extract_text(**vars(A))
+    outfp.close()
+    return 0


+if __name__ == '__main__': sys.exit(main())
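The rewritten tmp.py is essentially pdfminer.six's pdf2txt.py command-line tool, kept as a scratch reference; it replaces the old Elasticsearch experiments. A usage sketch, driving its main() programmatically (assumes the file is importable as tmp; the input is the test PDF added in this commit):

    # Equivalent to: python tmp.py -o out.txt -t text spec/test_files/pdf1.pdf
    import tmp

    rc = tmp.main(["-o", "out.txt", "-t", "text", "spec/test_files/pdf1.pdf"])
    print("exit code:", rc)  # 0 on success; extracted text lands in out.txt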