Audio tags in search page and svg thumbnail generation

2025-12-14 07:39:05 +00:00 · 2018-04-17 11:45:31 -04:00
parent dff7ddc511
commit 4eb9cf6b63
8 changed files with 128 additions and 105 deletions
--- a/parsing.py
+++ b/parsing.py
@@ -279,43 +279,27 @@ class TextFileParser(GenericFileParser):
            "text/x-perl", "text/x-dsrc", "text/scriptlet", "text/x-scala", "text/calendar",
            "text/x-bibtex", "text/x-tcl", "text/x-c++", "text/x-shellscript", "text/x-msdos-batch",
            "text/x-makefile", "text/rtf", "text/x-objective-c", "text/troff", "text/x-m4",
-            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po"
-        ]
-
-        self.encodings = [
-            'ascii', 'big5', 'big5hkscs', 'cp037', 'cp273', 'cp424', 'cp437',
-            'cp500', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855',
-            'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863',
-            'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932',
-            'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1125', 'cp1140',
-            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
-            'cp1256', 'cp1257', 'cp1258', 'cp65001', 'euc_jp', 'euc_jis_2004',
-            'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp',
-            'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 'iso2022_jp_3',
-            'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2', 'iso8859_3',
-            'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7', 'iso8859_8',
-            'iso8859_9', 'iso8859_10', 'iso8859_11', 'iso8859_13', 'iso8859_14',
-            'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_t', 'koi8_u',
-            'kz1048', 'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2',
-            'mac_roman', 'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
-            'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be',
-            'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig'
+            "text/x-lisp", "text/x-php", "text/x-gawk", "text/x-awk", "text/x-ruby", "text/x-po",
+            "text/x-makefile"
        ]

    def parse(self, full_path: str):
        info = super().parse(full_path)

-        with open(full_path, "rb") as text_file:
-            raw_content = text_file.read(self.content_length)
+        if self.content_length > 0:  # only read the file when a positive content_length is configured
+            with open(full_path, "rb") as text_file:
+                raw_content = text_file.read(self.content_length)

-            chardet.detect(raw_content)
-            encoding = chardet.detect(raw_content)["encoding"]
+                detection = chardet.detect(raw_content)  # run detection once and reuse the result
+                encoding = detection["encoding"]

-            if encoding is not None and encoding in self.encodings:
-                info["encoding"] = encoding
-                content = raw_content.decode(encoding, "ignore")
-
-                info["content"] = html.escape(content)
+                if encoding is not None:
+                    info["encoding"] = encoding
+                    try:
+                        content = raw_content.decode(encoding, "ignore")  # undecodable bytes are dropped
+                        info["content"] = html.escape(content)
+                    except (LookupError, UnicodeError):  # codec name unknown to Python, or decoding failed
+                        print("Unknown encoding: " + encoding)

        return info

@@ -373,49 +357,50 @@ class PdfFileParser(GenericFileParser):
    def parse(self, full_path: str):
        info = super().parse(full_path)

-        with open(full_path, "rb") as f:
+        if self.content_length > 0:  # otherwise the file is never opened and the base info is returned as-is
+            with open(full_path, "rb") as f:

-            info["content"] = ""
+                info["content"] = ""  # extracted text is accumulated into this string

-            parser = PDFParser(f)
-            document = PDFDocument(parser)
+                parser = PDFParser(f)
+                document = PDFDocument(parser)

-            if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":
-                if isinstance(document.info[0]["Title"], bytes):
-                    info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
-                else:
-                    info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"
+                if len(document.info) > 0 and "Title" in document.info[0] and document.info[0]["Title"] != b"":  # non-empty title in the first info entry
+                    if isinstance(document.info[0]["Title"], bytes):
+                        info["content"] += document.info[0]["Title"].decode("utf-8", "replace") + "\n"
+                    else:
+                        info["content"] += document.info[0]["Title"].resolve().decode("utf-8", "replace") + "\n"  # NOTE(review): presumably an indirect object reference - confirm

-            try:
-                if document.is_extractable:
-                    resource_manager = PDFResourceManager()
-                    la_params = LAParams()
+                try:
+                    if document.is_extractable:
+                        resource_manager = PDFResourceManager()
+                        la_params = LAParams()

-                    device = PDFPageAggregator(resource_manager, laparams=la_params)
-                    interpreter = PDFPageInterpreter(resource_manager, device)
+                        device = PDFPageAggregator(resource_manager, laparams=la_params)
+                        interpreter = PDFPageInterpreter(resource_manager, device)

-                    for page in PDFPage.create_pages(document):
+                        for page in PDFPage.create_pages(document):

-                        interpreter.process_page(page)
-                        layout = device.get_result()
+                            interpreter.process_page(page)
+                            layout = device.get_result()

-                        for lt_obj in layout:
-                            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
+                            for lt_obj in layout:
+                                if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):  # only text boxes and text lines contribute

-                                text = lt_obj.get_text()
+                                    text = lt_obj.get_text()

-                                if len(info["content"]) + len(text) <= self.content_length:
-                                    info["content"] += text
-                                else:
-                                    info["content"] += text[0:self.content_length - len(info["content"])]
-                                    break
-                        else:
-                            continue
-                        break
-                else:
-                    print("PDF is not extractable: " + full_path)
-            except ValueError:
-                print("Couldn't parse page for " + full_path)
+                                    if len(info["content"]) + len(text) <= self.content_length:  # the whole text still fits
+                                        info["content"] += text
+                                    else:
+                                        info["content"] += text[0:self.content_length - len(info["content"])]  # keep only the part that fits
+                                        break  # limit reached: leave the layout loop
+                            else:
+                                continue  # layout loop ended without break: go on to the next page
+                            break  # layout loop hit the limit: stop reading pages
+                    else:
+                        print("PDF is not extractable: " + full_path)
+                except ValueError:  # whatever was accumulated before the error stays in info
+                    print("Couldn't parse page for " + full_path)

        return info