Mirror of https://github.com/simon987/Simple-Incremental-Search-Tool.git, synced 2025-12-14 07:39:05 +00:00
Audio tags in search page and svg thumbnail generation
Changed file: parsing.py (111 changed lines)

@@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser):
             "text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
             "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
             "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
-            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po"
-        ]
-
-        self.encodings = [
-            'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
-            'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
-            'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
-            'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
-            'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
-            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
-            'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
-            'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
-            'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3',
-            'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3',
-            'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8',
-            'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',
-            'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
-            'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2',
-            'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
-            'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be',
-            'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig'
+            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
+            "text/x-makefile"
         ]
 
     def parse(self, full_path: str):
         info = super().parse(full_path)
 
-        with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_length)
+        if self.content_length > 0:
+            with open(full_path, "rb") as text_file:
+                raw_content = text_file.read(self.content_length)
 
-        chardet.detect(raw_content)
-        encoding = chardet.detect(raw_content)["encoding"]
+            chardet.detect(raw_content)
+            encoding = chardet.detect(raw_content)["encoding"]
 
-        if encoding is not None and encoding in self.encodings:
-            info["encoding"] = encoding
-            content = raw_content.decode(encoding, "ignore")
-
-            info["content"] = html.escape(content)
+            if encoding is not None:
+                info["encoding"] = encoding
+                try:
+                    content = raw_content.decode(encoding, "ignore")
+                    info["content"] = html.escape(content)
+                except Exception:
+                    print("Unknown encoding: " + encoding)
 
         return info
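
Note on the TextFileParser hunk above: the hard-coded whitelist of codec names (self.encodings) is removed, the file is only read when content_length is greater than zero, and the decode now trusts whatever chardet reports, guarded by a try/except instead of a membership test. Below is a minimal standalone sketch of that flow, written as a free function for illustration; the name read_text_preview and the 16 KB default are assumptions, not part of the repository.

import html

import chardet


def read_text_preview(path: str, content_length: int = 16384) -> dict:
    """Sketch of the detect-then-decode flow used by TextFileParser.parse().

    The function name and default length are illustrative only.
    """
    info = {}

    if content_length > 0:
        with open(path, "rb") as f:
            raw_content = f.read(content_length)

        # chardet.detect() returns {"encoding": ..., "confidence": ...}; encoding can be None
        encoding = chardet.detect(raw_content)["encoding"]

        if encoding is not None:
            info["encoding"] = encoding
            try:
                # errors="ignore" drops undecodable bytes instead of raising;
                # a codec name Python does not recognize still raises LookupError
                info["content"] = html.escape(raw_content.decode(encoding, "ignore"))
            except Exception:
                print("Unknown encoding: " + encoding)

    return info

Because errors="ignore" suppresses decode errors, the except branch mostly covers the case where chardet names a codec Python does not ship, which is roughly what the removed whitelist used to screen for.
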
@@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser):
     def parse(self, full_path: str):
         info = super().parse(full_path)
 
-        with open(full_path, "rb") as f:
+        if self.content_length > 0:
+            with open(full_path, "rb") as f:
 
-            info["content"] = ""
+                info["content"] = ""
 
-            parser = PDFParser(f)
-            document = PDFDocument(parser)
+                parser = PDFParser(f)
+                document = PDFDocument(parser)
 
-            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
-                if isinstance(document.info[0]["Title"], bytes):
-                    info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
-                else:
-                    info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
+                if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
+                    if isinstance(document.info[0]["Title"], bytes):
+                        info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+                    else:
+                        info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
 
-            try:
-                if document.is_extractable:
-                    resource_manager = PDFResourceManager()
-                    la_params = LAParams()
+                try:
+                    if document.is_extractable:
+                        resource_manager = PDFResourceManager()
+                        la_params = LAParams()
 
-                    device = PDFPageAggregator(resource_manager, laparams=la_params)
-                    interpreter = PDFPageInterpreter(resource_manager, device)
+                        device = PDFPageAggregator(resource_manager, laparams=la_params)
+                        interpreter = PDFPageInterpreter(resource_manager, device)
 
-                    for page in PDFPage.create_pages(document):
+                        for page in PDFPage.create_pages(document):
 
-                        interpreter.process_page(page)
-                        layout = device.get_result()
+                            interpreter.process_page(page)
+                            layout = device.get_result()
 
-                        for lt_obj in layout:
-                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+                            for lt_obj in layout:
+                                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
 
-                                text = lt_obj.get_text()
+                                    text = lt_obj.get_text()
 
-                                if len(info["content"]) + len(text) <= self.content_length:
-                                    info["content"] += text
-                                else:
-                                    info["content"] += text[0:self.content_length - len(info["content"])]
-                                    break
-                        else:
-                            continue
-                        break
-                else:
-                    print("PDF is not extractable: " + full_path)
-            except ValueError:
-                print("Couldn't parse page for " + full_path)
+                                    if len(info["content"]) + len(text) <= self.content_length:
+                                        info["content"] += text
+                                    else:
+                                        info["content"] += text[0:self.content_length - len(info["content"])]
+                                        break
+                            else:
+                                continue
+                            break
+                    else:
+                        print("PDF is not extractable: " + full_path)
+                except ValueError:
+                    print("Couldn't parse page for " + full_path)
 
         return info
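
Note on the PdfFileParser hunk above: the body of parse() is unchanged apart from being wrapped in if self.content_length > 0: and re-indented, so a PDF is no longer opened or run through pdfminer when content indexing is disabled. Below is a standalone sketch of the same pdfminer.six page-aggregation loop with the length cap, written as a free function for illustration; the name extract_pdf_text and the 8192-character default are assumptions, not part of the repository, and the title-metadata handling is left out.

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser


def extract_pdf_text(path: str, content_length: int = 8192) -> str:
    """Sketch of the capped text extraction used by PdfFileParser.parse().

    The function name and default cap are illustrative only.
    """
    content = ""

    if content_length <= 0:
        return content  # content indexing disabled, skip the file entirely

    with open(path, "rb") as f:
        document = PDFDocument(PDFParser(f))

        if not document.is_extractable:
            print("PDF is not extractable: " + path)
            return content

        resource_manager = PDFResourceManager()
        device = PDFPageAggregator(resource_manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

            # get_result() yields the page layout; text lives in LTTextBox/LTTextLine objects
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    remaining = content_length - len(content)
                    if remaining <= 0:
                        return content
                    content += lt_obj.get_text()[:remaining]

    return content

The sketch returns early instead of reproducing the nested break/continue/else flow of the original, but the effect is the same: text accumulates until content_length characters have been collected.
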