Compare commits

..

No commits in common. "bb91139ffbf9477310f650589ad4fcdc18dc3702" and "cd71551a225d40ba675994295ab21cbb0ffbae40" have entirely different histories.

82 changed files with 89 additions and 5950 deletions

View File

@ -10,7 +10,7 @@ steps:
- name: build
image: simon987/sist2-build
commands:
- ./scripts/build.sh
- ./ci/build.sh
- name: docker
image: plugins/docker
settings:
@ -55,7 +55,7 @@ steps:
- name: build
image: simon987/sist2-build-arm64
commands:
- ./scripts/build_arm64.sh
- ./ci/build_arm64.sh
- name: scp files
image: appleboy/drone-scp
settings:

4
.gitignore vendored
View File

@ -10,13 +10,13 @@ Makefile
LOG
sist2*
!sist2-vue/
*.sist2/
index.sist2/
bundle*.css
bundle.js
*.a
vgcore.*
build/
third-party/argparse
third-party/
*.idx/
VERSION
git_hash.h

8
.gitmodules vendored
View File

@ -3,10 +3,4 @@
url = https://github.com/simon987/libscan
[submodule "third-party/argparse"]
path = third-party/argparse
url = https://github.com/simon987/argparse
[submodule "third-party/libscan/third-party/utf8.h"]
path = third-party/libscan/third-party/utf8.h
url = https://github.com/sheredom/utf8.h
[submodule "third-party/libscan/third-party/antiword"]
path = third-party/libscan/third-party/antiword
url = https://github.com/simon987/antiword
url = https://github.com/cofyc/argparse

View File

@ -6,10 +6,12 @@ COPY . .
RUN cmake -DSIST_PLATFORM=x64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE_TOOLCHAIN_FILE=/vcpkg/scripts/buildsystems/vcpkg.cmake .
RUN make -j$(nproc)
RUN strip sist2
RUN ls -lh
RUN ls -lh sist2-vue/dist/
FROM ubuntu:21.10
FROM ubuntu:20.10
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN apt update && apt install -y curl libasan5
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@ -20,9 +22,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
COPY --from=build /build/sist2 /root/sist2
ENTRYPOINT ["/root/sist2"]

View File

@ -7,9 +7,9 @@ RUN cmake -DSIST_PLATFORM=arm64_linux -DSIST_DEBUG=off -DBUILD_TESTS=off -DCMAKE
RUN make -j$(nproc)
RUN strip sist2
FROM --platform="linux/arm64/v8" ubuntu:21.10
FROM ubuntu:20.10
RUN apt update && apt install -y curl libasan5 && rm -rf /var/lib/apt/lists/*
RUN apt update && apt install -y curl libasan5
RUN mkdir -p /usr/share/tessdata && \
cd /usr/share/tessdata/ && \
@ -20,9 +20,9 @@ RUN mkdir -p /usr/share/tessdata && \
curl -o /usr/share/tessdata/rus.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/rus.traineddata &&\
curl -o /usr/share/tessdata/spa.traineddata https://raw.githubusercontent.com/tesseract-ocr/tessdata/master/spa.traineddata
COPY --from=build /build/sist2 /root/sist2
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
ENTRYPOINT ["/root/sist2"]
COPY --from=build /build/sist2 /root/sist2
ENTRYPOINT ["/root/sist2"]

View File

@ -2,7 +2,7 @@
[![CodeFactor](https://www.codefactor.io/repository/github/simon987/sist2/badge?s=05daa325188aac4eae32c786f3d9cf4e0593f822)](https://www.codefactor.io/repository/github/simon987/sist2)
[![Development snapshots](https://ci.simon987.net/api/badges/simon987/sist2/status.svg)](https://files.simon987.net/.gate/sist2/simon987_sist2/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/)
**Demo**: [sist2.simon987.net](https://sist2.simon987.net/?i=Demo%20files)
# sist2
@ -33,11 +33,12 @@ sist2 (Simple incremental search tool)
## Getting Started
1. Have an Elasticsearch (>= 6.8.X, ideally >=7.14.0) instance running
1. Have an Elasticsearch (>= 6.X.X) instance running
1. Download [from official website](https://www.elastic.co/downloads/elasticsearch)
1. *(or)* Run using docker:
```bash
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.14.0
docker run -d --name es1 --net sist2_net -p 9200:9200 \
-e "discovery.type=single-node" elasticsearch:7.14.0
```
1. *(or)* Run using docker-compose:
```yaml
@ -51,7 +52,7 @@ sist2 (Simple incremental search tool)
1. Download the [latest sist2 release](https://github.com/simon987/sist2/releases) *
1. *(or)* Download a [development snapshot](https://files.simon987.net/.gate/sist2/simon987_sist2/) *(Not
recommended!)*
1. *(or)* `docker pull simon987/sist2:2.11.4-x64-linux`
1. *(or)* `docker pull simon987/sist2:2.11.3-x64-linux`
1. See [Usage guide](docs/USAGE.md)

View File

@ -14,7 +14,6 @@
* [examples](#web-examples)
* [rewrite_url](#rewrite_url)
* [link to specific indices](#link-to-specific-indices)
* [elasticsearch](#elasticsearch)
* [exec-script](#exec-script)
* [tagging](#tagging)
* [sidecar files](#sidecar-files)
@ -267,20 +266,9 @@ sist2 web index1 index2 index3 index4
When the `rewrite_url` field is not empty, the web module ignores the `root`
field and will return a HTTP redirect to `<rewrite_url><path>/<name><extension>`
instead of serving the file from disk.
Both the `root` and `rewrite_url` fields are safe to manually modify from the
Both the `root` and `rewrite_url` fields are safe to manually modify from the
`descriptor.json` file.
# Elasticsearch
Elasticsearch versions >=6.8.0, <8.0.0 are supported by sist2.
Using a version >=7.14.0 is recommended to enable the following features:
- Bug fix for large documents (See #198)
When using a legacy version of ES, a notice will be displayed next to the sist2 version in the web UI.
If you don't care about the features above, you can ignore it or disable it in the configuration page.
## exec-script
The `exec-script` command is used to execute a user script for an index that has already been imported to Elasticsearch with the `index` command. Note that the documents will not be reset to their default state before each execution as the `index` command does: if you make undesired changes to the documents by accident, you will need to run `index` again to revert to the original state.

View File

@ -78,7 +78,6 @@
"name": {
"analyzer": "content_analyzer",
"type": "text",
"fielddata": true,
"fields": {
"nGram": {
"type": "text",

View File

@ -1,58 +0,0 @@
{
"index": {
"refresh_interval": "30s",
"codec": "best_compression",
"number_of_replicas": 0
},
"analysis": {
"tokenizer": {
"path_tokenizer": {
"type": "path_hierarchy",
"delimiter": "/"
},
"tag_tokenizer": {
"type": "path_hierarchy",
"delimiter": "."
},
"my_nGram_tokenizer": {
"type": "nGram",
"min_gram": 3,
"max_gram": 3
}
},
"analyzer": {
"path_analyzer": {
"tokenizer": "path_tokenizer",
"filter": [
"lowercase"
]
},
"tag_analyzer": {
"tokenizer": "tag_tokenizer",
"filter": [
"lowercase"
]
},
"case_insensitive_kw_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase"
]
},
"my_nGram": {
"tokenizer": "my_nGram_tokenizer",
"filter": [
"lowercase",
"asciifolding"
]
},
"content_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
}

View File

@ -3,7 +3,6 @@ import json
files = [
"schema/mappings.json",
"schema/settings.json",
"schema/settings_legacy.json",
"schema/pipeline.json",
]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -51,7 +51,6 @@ export interface EsHit {
duration: number
tag: string[]
checksum: string
thumbnail: string
}
_props: {
isSubDocument: boolean
@ -62,8 +61,6 @@ export interface EsHit {
isPlayableImage: boolean
isAudio: boolean
hasThumbnail: boolean
tnW: number
tnH: number
}
highlight: {
name: string[] | undefined,
@ -134,8 +131,6 @@ class Sist2Api {
if ("thumbnail" in hit._source) {
hit._props.hasThumbnail = true;
hit._props.tnW = Number(hit._source.thumbnail.split(",")[0]);
hit._props.tnH = Number(hit._source.thumbnail.split(",")[1]);
}
switch (mimeCategory) {

View File

@ -43,20 +43,6 @@ const SORT_MODES = {
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.size
},
nameAsc: {
mode: [
{name: {order: "asc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
},
nameDesc: {
mode: [
{name: {order: "desc"}},
{_tie: {order: "asc"}}
],
key: (hit: EsHit) => hit._source.name
}
} as any;
@ -87,8 +73,6 @@ class Sist2Query {
const selectedMimeTypes = getters.selectedMimeTypes;
const selectedTags = getters.selectedTags;
const legacyES = store.state.sist2Info.esVersionLegacy;
const filters = [
{terms: {index: selectedIndexIds}}
] as any[];
@ -203,13 +187,9 @@ class Sist2Query {
"name.nGram": {},
"content.nGram": {},
font_name: {},
}
},
max_analyzed_offset: 9_999_999
};
if (!legacyES) {
q.highlight.max_analyzed_offset = 9_999_999;
}
if (getters.optSearchInPath) {
q.highlight.fields["path.text"] = {};
q.highlight.fields["path.nGram"] = {};

View File

@ -5,6 +5,7 @@
<b-card-body>
<!-- TODO: ES connectivity, Link to GH page -->
<b-table :items="tableItems" small borderless responsive="md" thead-class="hidden" class="mb-0"></b-table>
<hr />
@ -31,9 +32,6 @@ export default {
{key: "esIndex", value: this.$store.state.sist2Info.esIndex},
{key: "tagline", value: this.$store.state.sist2Info.tagline},
{key: "dev", value: this.$store.state.sist2Info.dev},
{key: "esVersion", value: this.$store.state.sist2Info.esVersion},
{key: "esVersionSupported", value: this.$store.state.sist2Info.esVersionSupported},
{key: "esVersionLegacy", value: this.$store.state.sist2Info.esVersionLegacy},
]
}
}

View File

@ -15,15 +15,11 @@
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
<div
v-if="doc._props.isImage && !hover && doc._props.tnW / doc._props.tnH < 5"
class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<div v-if="doc._props.isImage && !hover" class="card-img-overlay" :class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ `${doc._source.width}x${doc._source.height}` }}</span>
</div>
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover"
class="card-img-overlay"
<div v-if="(doc._props.isVideo || doc._props.isGif) && doc._source.duration > 0 && !hover" class="card-img-overlay"
:class="{'small-badge': smallBadge}">
<span class="badge badge-resolution">{{ humanTime(doc._source.duration) }}</span>
</div>
@ -43,8 +39,7 @@
</div>
<!-- Audio player-->
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls
:type="doc._source.mime"
<audio v-if="doc._props.isAudio" ref="audio" preload="none" class="audio-fit fit" controls :type="doc._source.mime"
:src="`f/${doc._id}`"
@play="onAudioPlay()"></audio>

View File

@ -7,27 +7,11 @@
value-field="id"></b-form-select>
</div>
<div v-else>
<div class="d-flex justify-content-between align-content-center">
<span>
{{ selectedIndices.length }}
{{ selectedIndices.length === 1 ? $t("indexPicker.selectedIndex") : $t("indexPicker.selectedIndices") }}
</span>
<div>
<b-button variant="link" @click="selectAll()"> {{ $t("indexPicker.selectAll") }}</b-button>
<b-button variant="link" @click="selectNone()"> {{ $t("indexPicker.selectNone") }}</b-button>
</div>
</div>
<b-list-group id="index-picker-desktop" class="unselectable">
<b-list-group id="index-picker-desktop">
<b-list-group-item
v-for="idx in indices"
@click="toggleIndex(idx, $event)"
@click.shift="shiftClick(idx, $event)"
class="d-flex justify-content-between align-items-center list-group-item-action pointer"
:class="{active: lastClickIndex === idx}"
>
@click="toggleIndex(idx)"
class="d-flex justify-content-between align-items-center list-group-item-action pointer">
<div class="d-flex">
<b-checkbox @change="toggleIndex(idx)" :checked="isSelected(idx)"></b-checkbox>
{{ idx.name }}
@ -52,7 +36,6 @@ export default Vue.extend({
data() {
return {
loading: true,
lastClickIndex: null
}
},
computed: {
@ -70,50 +53,13 @@ export default Vue.extend({
...mapActions({
setSelectedIndices: "setSelectedIndices"
}),
shiftClick(index, e) {
if (this.lastClickIndex === null) {
return;
}
const select = this.isSelected(this.lastClickIndex);
let leftBoundary = this.indices.indexOf(this.lastClickIndex);
let rightBoundary = this.indices.indexOf(index);
if (rightBoundary < leftBoundary) {
let tmp = leftBoundary;
leftBoundary = rightBoundary;
rightBoundary = tmp;
}
for (let i = leftBoundary; i <= rightBoundary; i++) {
if (select) {
if (!this.isSelected(this.indices[i])) {
this.setSelectedIndices([this.indices[i], ...this.selectedIndices]);
}
} else {
this.setSelectedIndices(this.selectedIndices.filter(idx => idx !== this.indices[i]));
}
}
},
selectAll() {
this.setSelectedIndices(this.indices);
},
selectNone() {
this.setSelectedIndices([]);
},
onSelect(value) {
this.setSelectedIndices(this.indices.filter(idx => value.includes(idx.id)));
},
formatIdxDate(timestamp: number): string {
return format(new Date(timestamp * 1000), "yyyy-MM-dd");
},
toggleIndex(index, e) {
if (e.shiftKey) {
return;
}
this.lastClickIndex = index;
toggleIndex(index) {
if (this.isSelected(index)) {
this.setSelectedIndices(this.selectedIndices.filter(idx => idx.id != index.id));
} else {
@ -146,21 +92,4 @@ export default Vue.extend({
overflow-y: auto;
max-height: 132px;
}
.btn-link:focus {
box-shadow: none;
}
.unselectable {
user-select: none;
-ms-user-select: none;
-moz-user-select: none;
-webkit-user-select: none;
}
.list-group-item.active {
z-index: 2;
background-color: inherit;
color: inherit;
}
</style>

View File

@ -21,9 +21,6 @@ export default {
if (mutation.type === "setUiMimeMap") {
const mimeMap = mutation.payload.slice();
const elem = document.getElementById("mimeTree");
console.log(elem);
this.mimeTree = new InspireTree({
selection: {
mode: 'checkbox'

View File

@ -8,8 +8,7 @@
</b-navbar-brand>
<span class="badge badge-pill version" v-if="$store && $store.state.sist2Info">
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span><span v-if="isLegacy() && !hideLegacy()">-<a
href="https://github.com/simon987/sist2/blob/master/docs/USAGE.md#elasticsearch" target="_blank">legacyES</a></span>
v{{ sist2Version() }}<span v-if="isDebug()">-dbg</span>
</span>
<span v-if="$store && $store.state.sist2Info" class="tagline" v-html="tagline()"></span>
@ -21,7 +20,6 @@
<script>
import Sist2Icon from "@/components/Sist2Icon";
export default {
name: "NavBar",
components: {Sist2Icon},
@ -34,12 +32,6 @@ export default {
},
isDebug() {
return this.$store.state.sist2Info.debug;
},
isLegacy() {
return this.$store.state.sist2Info.esVersionLegacy;
},
hideLegacy() {
return this.$store.state.optHideLegacy;
}
}
}
@ -103,7 +95,7 @@ export default {
}
}
.theme-light .btn-link {
.theme-light .btn-link{
color: #222;
}
</style>

View File

@ -5,11 +5,9 @@
<div style="float: right">
<b-button v-b-toggle.collapse-1 variant="primary" class="not-mobile">{{ $t("details") }}</b-button>
<template v-if="hitCount !== 0">
<SortSelect class="ml-2"></SortSelect>
<SortSelect class="ml-2"></SortSelect>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
</template>
<DisplayModeToggle class="ml-2"></DisplayModeToggle>
</div>
<b-collapse id="collapse-1" class="pt-2" style="clear:both;">
@ -23,7 +21,7 @@
<script lang="ts">
import {EsResult} from "@/Sist2Api";
import Vue from "vue";
import {humanFileSize} from "@/util";
import {humanFileSize, humanTime} from "@/util";
import DisplayModeToggle from "@/components/DisplayModeToggle.vue";
import SortSelect from "@/components/SortSelect.vue";

View File

@ -19,14 +19,6 @@
{{ $t("sort.sizeDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameDesc'}" @click="onSelect('nameDesc')">
{{ $t("sort.nameDesc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'nameAsc'}" @click="onSelect('nameAsc')">
{{ $t("sort.nameAsc") }}
</b-dropdown-item>
<b-dropdown-item :class="{'dropdown-active': sort === 'random'}" @click="onSelect('random')">
{{ $t("sort.random") }}
</b-dropdown-item>

View File

@ -63,8 +63,7 @@ export default {
slideDuration: "Slide duration",
resultSize: "Number of results per page",
tagOrOperator: "Use OR operator when specifying multiple tags.",
hideDuplicates: "Hide duplicate results based on checksum",
hideLegacy: "Hide the 'legacyES' Elasticsearch notice"
hideDuplicates: "Hide duplicate results based on checksum"
},
queryMode: {
simple: "Simple",
@ -131,14 +130,13 @@ export default {
saveTagModalTitle: "Add tag",
saveTagPlaceholder: "Tag name",
confirm: "Confirm",
indexPickerPlaceholder: "Select indices",
sort: {
relevance: "Relevance",
dateAsc: "Date (Older first)",
dateDesc: "Date (Newer first)",
sizeAsc: "Size (Smaller first)",
sizeDesc: "Size (Larger first)",
nameAsc: "Name (A-z)",
nameDesc: "Name (Z-a)",
random: "Random",
},
d3: {
@ -146,13 +144,7 @@ export default {
mimeSize: "Size distribution by media type",
dateHistogram: "File modification time distribution",
sizeHistogram: "File size distribution",
},
indexPicker: {
selectNone: "Select None",
selectAll: "Select All",
selectedIndex: "selected index",
selectedIndices: "selected indices",
},
}
},
fr: {
searchBar: {
@ -219,8 +211,7 @@ export default {
slideDuration: "Durée des diapositives",
resultSize: "Nombre de résultats par page",
tagOrOperator: "Utiliser l'opérateur OU lors de la spécification de plusieurs tags",
hideDuplicates: "Masquer les résultats en double",
hideLegacy: "Masquer la notice 'legacyES' Elasticsearch"
hideDuplicates: "Masquer les résultats en double"
},
queryMode: {
simple: "Simple",
@ -295,8 +286,6 @@ export default {
dateDesc: "Date (Plus récent)",
sizeAsc: "Taille (Plus petit)",
sizeDesc: "Taille (Plus grand)",
nameAsc: "Nom (A-z)",
nameDesc: "Nom (Z-a)",
random: "Aléatoire",
},
d3: {
@ -304,12 +293,6 @@ export default {
mimeSize: "Distribution des tailles de fichiers par type de média",
dateHistogram: "Distribution des dates de modification",
sizeHistogram: "Distribution des tailles de fichier",
},
indexPicker: {
selectNone: "Sélectionner aucun",
selectAll: "Sélectionner tout",
selectedIndex: "indice sélectionné",
selectedIndices: "indices sélectionnés",
},
}
}
}

View File

@ -46,7 +46,6 @@ export default new Vuex.Store({
optTreemapColor: "PuBuGn",
optLightboxLoadOnlyCurrent: false,
optLightboxSlideDuration: 15,
optHideLegacy: false,
_onLoadSelectedIndices: [] as string[],
_onLoadSelectedMimeTypes: [] as string[],
@ -145,7 +144,6 @@ export default new Vuex.Store({
setOptTreemapColorGroupingDepth: (state, val) => state.optTreemapColorGroupingDepth = val,
setOptTreemapSize: (state, val) => state.optTreemapSize = val,
setOptTreemapColor: (state, val) => state.optTreemapColor = val,
setOptHideLegacy: (state, val) => state.optHideLegacy = val,
setOptLightboxLoadOnlyCurrent: (state, val) => state.optLightboxLoadOnlyCurrent = val,
@ -341,6 +339,5 @@ export default new Vuex.Store({
optLightboxLoadOnlyCurrent: state => state.optLightboxLoadOnlyCurrent,
optLightboxSlideDuration: state => state.optLightboxSlideDuration,
optResultSize: state => state.size,
optHideLegacy: state => state.optHideLegacy,
}
})

View File

@ -19,10 +19,6 @@
{{ $t("opt.lightboxLoadOnlyCurrent") }}
</b-form-checkbox>
<b-form-checkbox :checked="optHideLegacy" @input="setOptHideLegacy">
{{ $t("opt.hideLegacy") }}
</b-form-checkbox>
<label>{{ $t("opt.lang") }}</label>
<b-form-select :options="langOptions" :value="optLang" @input="setOptLang"></b-form-select>
@ -219,7 +215,6 @@ export default {
"optTagOrOperator",
"optLang",
"optHideDuplicates",
"optHideLegacy",
]),
clientWidth() {
return window.innerWidth;
@ -259,8 +254,7 @@ export default {
"setOptResultSize",
"setOptTagOrOperator",
"setOptLang",
"setOptHideDuplicates",
"setOptHideLegacy"
"setOptHideDuplicates"
]),
onResetClick() {
localStorage.removeItem("sist2_configuration");

View File

@ -31,7 +31,7 @@
</b-row>
</b-col>
<b-col>
<b-tabs justified>
<b-tabs>
<b-tab :title="$t('mimeTypes')">
<MimePicker></MimePicker>
</b-tab>
@ -43,13 +43,9 @@
</b-row>
</b-card>
<div v-show="docs.length === 0 && !uiLoading">
<Preloader v-if="searchBusy" class="mt-3"></Preloader>
<Preloader v-if="searchBusy && docs.length === 0" class="mt-3"></Preloader>
<ResultsCard></ResultsCard>
</div>
<div v-if="docs.length > 0">
<div v-else-if="docs.length > 0">
<ResultsCard></ResultsCard>
<DocCardWall v-if="optDisplay==='grid'" :docs="docs" :append="appendFunc"></DocCardWall>
@ -113,6 +109,10 @@ export default Vue.extend({
}, 350, {leading: false});
Sist2Api.getMimeTypes().then(mimeMap => {
this.$store.commit("setUiMimeMap", mimeMap);
});
this.$store.dispatch("loadFromArgs", this.$route).then(() => {
this.$store.subscribe(() => this.$store.dispatch("updateArgs", this.$router));
this.$store.subscribe((mutation) => {
@ -138,13 +138,9 @@ export default Vue.extend({
sist2.getSist2Info().then(data => {
this.setSist2Info(data);
this.setIndices(data.indices);
this.uiLoading = false;
Sist2Api.getMimeTypes().then(mimeMap => {
this.$store.commit("setUiMimeMap", mimeMap);
this.uiLoading = false;
this.search(true);
});
this.search(true);
}).catch(() => {
this.showErrorToast();
});
@ -213,7 +209,7 @@ export default Vue.extend({
resp.hits.hits = resp.hits.hits.filter(hit => {
if (!("checksum" in hit._source)) {
return true;
return true;
}
const isDupe = !this.docChecksums.has(hit._source.checksum);

View File

@ -2,7 +2,6 @@
ScanCtx_t ScanCtx = {
.stat_index_size = 0,
.stat_tn_size = 0,
.dbg_current_files = NULL,
.pool = NULL
};

View File

@ -17,7 +17,6 @@
#include "libscan/wpd/wpd.h"
#include "libscan/json/json.h"
#include "src/io/store.h"
#include "src/index/elastic.h"
#include <glib.h>
#include <pcre.h>
@ -76,7 +75,6 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int batch_size;
tpool_t *pool;
@ -88,7 +86,6 @@ typedef struct {
typedef struct {
char *es_url;
es_version_t *es_version;
char *es_index;
int index_count;
char *auth_user;

View File

@ -253,7 +253,7 @@ void _elastic_flush(int max) {
} else {
print_errors(r);
LOG_DEBUGF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
LOG_INFOF("elastic.c", "Indexed %d documents (%zukB) <%d>", count, buf_len / 1024, r->status_code);
delete_queue(max);
if (Indexer->queued != 0) {
@ -356,65 +356,7 @@ void finish_indexer(char *script, int async_script, char *index_id) {
free_response(r);
}
es_version_t *elastic_get_version(const char *es_url) {
response_t *r = web_get(es_url, 30);
char *tmp = malloc(r->size + 1);
memcpy(tmp, r->body, r->size);
*(tmp + r->size) = '\0';
cJSON *response = cJSON_Parse(tmp);
free(tmp);
free_response(r);
if (response == NULL) {
return NULL;
}
if (cJSON_GetObjectItem(response, "version") == NULL ||
cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number") == NULL) {
cJSON_Delete(response);
return NULL;
}
char *version_str = cJSON_GetObjectItem(cJSON_GetObjectItem(response, "version"), "number")->valuestring;
es_version_t *version = malloc(sizeof(es_version_t));
const char *tok = strtok(version_str, ".");
version->major = atoi(tok);
tok = strtok(NULL, ".");
version->minor = atoi(tok);
tok = strtok(NULL, ".");
version->patch = atoi(tok);
cJSON_Delete(response);
return version;
}
void elastic_init(int force_reset, const char *user_mappings, const char *user_settings) {
es_version_t *es_version = elastic_get_version(IndexCtx.es_url);
IndexCtx.es_version = es_version;
if (es_version == NULL) {
LOG_FATAL("elastic.c", "Could not get ES version")
}
LOG_INFOF("elastic.c",
"Elasticsearch version is %s (supported=%d, legacy=%d)",
format_es_version(es_version), IS_SUPPORTED_ES_VERSION(es_version), USE_LEGACY_ES_SETTINGS(es_version));
if (!IS_SUPPORTED_ES_VERSION(es_version)) {
LOG_FATAL("elastic.c", "sist2 only supports Elasticsearch v6.8 or newer")
}
char *settings = NULL;
if (USE_LEGACY_ES_SETTINGS(es_version)) {
settings = settings_json;
} else {
settings = settings_legacy_json;
}
void elastic_init(int force_reset, const char* user_mappings, const char* user_settings) {
// Check if index exists
char url[4096];
@ -450,7 +392,7 @@ void elastic_init(int force_reset, const char *user_mappings, const char *user_s
free_response(r);
snprintf(url, sizeof(url), "%s/%s/_settings", IndexCtx.es_url, IndexCtx.es_index);
r = web_put(url, user_settings ? user_settings : settings);
r = web_put(url, user_settings ? user_settings : settings_json);
LOG_INFOF("elastic.c", "Update ES settings <%d>", r->status_code);
if (r->status_code != 200) {
print_error(r);

View File

@ -9,26 +9,6 @@ typedef struct es_bulk_line {
char line[0];
} es_bulk_line_t;
/**
 * Elasticsearch server version, split into its numeric components.
 */
typedef struct {
    int major;
    int minor;
    int patch;
} es_version_t;

/**
 * Test whether `version` is at least `maj`.`min`; the patch component is
 * not taken into account.
 *
 * @return 1 when version >= maj.min, 0 otherwise. A NULL version (no version
 *         could be determined) compares lower than every version.
 */
static inline int es_version_ge(const es_version_t *version, int maj, int min) {
    if (version == NULL) {
        return 0;
    }
    return version->major > maj || (version->major == maj && version->minor >= min);
}

// The macros forward to es_version_ge() so that their argument is evaluated
// exactly once and a NULL version is handled in a single place.
#define VERSION_GE(version, maj, min) (es_version_ge((version), (maj), (min)))
#define IS_SUPPORTED_ES_VERSION(es_version) VERSION_GE((es_version), 6, 8)
#define USE_LEGACY_ES_SETTINGS(es_version) (!VERSION_GE((es_version), 7, 14))

/**
 * Format a version as "major.minor.patch".
 *
 * The result points to a static buffer that is overwritten by the next call;
 * copy it if it has to outlive that call. No locking is performed.
 *
 * @param version Version to format, may be NULL.
 * @return The formatted string, or "unknown" when `version` is NULL.
 */
static inline const char *format_es_version(const es_version_t *version) {
    static char buf[64];

    if (version == NULL) {
        return "unknown";
    }

    snprintf(buf, sizeof(buf), "%d.%d.%d", version->major, version->minor, version->patch);
    return buf;
}
/**
* Note: indexer is *not* thread safe
*/
@ -51,8 +31,6 @@ cJSON *elastic_get_document(const char *id_str);
char *elastic_get_status();
es_version_t *elastic_get_version(const char *es_url);
void execute_update_script(const char *script, int async, const char index_id[MD5_STR_LENGTH]);
#endif

File diff suppressed because one or more lines are too long

View File

@ -23,6 +23,7 @@ store_t *store_create(const char *path, size_t chunk_size) {
}
store->size = (size_t) store->chunk_size;
ScanCtx.stat_tn_size = 0;
mdb_env_set_mapsize(store->env, store->size);
// Open dbi

View File

@ -43,36 +43,26 @@ int sub_strings[30];
int handle_entry(const char *filepath, const struct stat *info, int typeflag, struct FTW *ftw) {
if (ftw->level > ScanCtx.depth) {
if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
return FTW_CONTINUE;
}
if (typeflag == FTW_F && S_ISREG(info->st_mode) && ftw->level <= ScanCtx.depth) {
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (ScanCtx.exclude != NULL && EXCLUDED(filepath)) {
LOG_DEBUGF("walk.c", "Excluded: %s", filepath)
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
pthread_mutex_lock(&ScanCtx.dbg_file_counts_mu);
ScanCtx.dbg_excluded_files_count += 1;
pthread_mutex_unlock(&ScanCtx.dbg_file_counts_mu);
return 0;
} else if (typeflag == FTW_D) {
return FTW_SKIP_SUBTREE;
}
}
if (typeflag == FTW_F && S_ISREG(info->st_mode)) {
parse_job_t *job = create_fs_parse_job(filepath, info, ftw->base);
tpool_add_work(ScanCtx.pool, parse, job);
}
return FTW_CONTINUE;
return 0;
}
#define MAX_FILE_DESCRIPTORS 64
int walk_directory_tree(const char *dirpath) {
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_ACTIONRETVAL);
return nftw(dirpath, handle_entry, MAX_FILE_DESCRIPTORS, FTW_PHYS | FTW_DEPTH);
}

View File

@ -55,14 +55,10 @@ void vsist_logf(const char *filepath, int level, char *format, va_list ap) {
log_len += 1;
}
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno))
}
write(STDERR_FILENO, log_str, log_len);
}
void sist_logf(const char *filepath, int level, char *format, ...) {
@ -108,12 +104,8 @@ void sist_log(const char *filepath, int level, char *str) {
);
}
if (PrintingProgressBar) {
PrintingProgressBar = FALSE;
memmove(log_str + 1, log_str, log_len);
log_str[0] = '\n';
log_len += 1;
int ret = write(STDERR_FILENO, log_str, log_len);
if (ret == -1) {
LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
}
write(STDERR_FILENO, log_str, log_len);
}

View File

@ -433,7 +433,7 @@ void sist2_index(index_args_t *args) {
cleanup = elastic_cleanup;
}
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, args->print == 0);
IndexCtx.pool = tpool_create(args->threads, cleanup, FALSE, FALSE);
tpool_start(IndexCtx.pool);
struct dirent *de;
@ -518,8 +518,8 @@ void sist2_web(web_args_t *args) {
int main(int argc, const char *argv[]) {
sigsegv_handler = signal(SIGSEGV, sig_handler);
sigabrt_handler = signal(SIGABRT, sig_handler);
// sigsegv_handler = signal(SIGSEGV, sig_handler);
// sigabrt_handler = signal(SIGABRT, sig_handler);
setlocale(LC_ALL, "");

View File

@ -1,8 +1,6 @@
#ifndef SIST_H
#define SIST_H
#define _GNU_SOURCE
#ifndef FALSE
#define FALSE (0)
#define BOOL int
@ -53,7 +51,7 @@
#include <ctype.h>
#include "git_hash.h"
#define VERSION "2.11.5"
#define VERSION "2.11.3"
static const char *const Version = VERSION;
#ifndef SIST_PLATFORM

View File

@ -177,7 +177,7 @@ static void *tpool_worker(void *arg) {
}
void tpool_wait(tpool_t *pool) {
LOG_DEBUG("tpool.c", "Waiting for worker threads to finish")
LOG_INFO("tpool.c", "Waiting for worker threads to finish")
pthread_mutex_lock(&(pool->work_mutex));
while (TRUE) {
if (pool->done_cnt < pool->work_cnt) {
@ -191,9 +191,7 @@ void tpool_wait(tpool_t *pool) {
}
}
}
if (pool->print_progress) {
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
}
progress_bar_print(1.0, ScanCtx.stat_tn_size, ScanCtx.stat_index_size);
pthread_mutex_unlock(&(pool->work_mutex));
LOG_INFO("tpool.c", "Worker threads finished")

View File

@ -84,13 +84,11 @@ char *expandpath(const char *path) {
return expanded;
}
int PrintingProgressBar = 0;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
static int last_val = -1;
int val = (int) (percentage * 100);
if (last_val == val || val > 100) {
if (last_val == val || val > 100 || index_size < 1024) {
return;
}
last_val = val;
@ -116,21 +114,13 @@ void progress_bar_print(double percentage, size_t tn_size, size_t index_size) {
index_unit = 'M';
}
if (tn_size == 0 && index_size == 0) {
fprintf(stderr,
"\r%3d%%[%.*s>%*s]",
val, lpad, PBSTR, rpad, ""
);
} else {
fprintf(stderr,
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
}
PrintingProgressBar = TRUE;
printf(
"\r%3d%%[%.*s>%*s] TN:%3d%c IDX:%3d%c",
val, lpad, PBSTR, rpad, "",
(int) tn_size, tn_unit,
(int) index_size, index_unit
);
fflush(stdout);
}
GHashTable *incremental_get_table() {

View File

@ -19,8 +19,6 @@ char *expandpath(const char *path);
dyn_buffer_t url_escape(char *str);
extern int PrintingProgressBar;
void progress_bar_print(double percentage, size_t tn_size, size_t index_size);
GHashTable *incremental_get_table();

View File

@ -252,32 +252,12 @@ void serve_file_from_disk(cJSON *json, index_t *idx, struct mg_connection *nc, s
mg_http_serve_file(nc, hm, full_path, mime, disposition);
}
/**
 * Look up the Elasticsearch version for WebCtx.es_url and store it in
 * WebCtx.es_version.
 *
 * Once a lookup has succeeded, later calls return immediately. A failed
 * lookup (NULL result) leaves WebCtx.es_version untouched and is retried on
 * the next call. The "already cached" flag is a plain function-local static;
 * no locking is performed here.
 */
void cache_es_version() {
    static int is_cached = FALSE;

    if (is_cached != TRUE) {
        es_version_t *fetched = elastic_get_version(WebCtx.es_url);

        if (fetched == NULL) {
            // Lookup failed: keep is_cached unset so the next call tries again.
            return;
        }

        WebCtx.es_version = fetched;
        is_cached = TRUE;
    }
}
void index_info(struct mg_connection *nc) {
cache_es_version();
cJSON *json = cJSON_CreateObject();
cJSON *arr = cJSON_AddArrayToObject(json, "indices");
cJSON_AddStringToObject(json, "esIndex", WebCtx.es_index);
cJSON_AddStringToObject(json, "version", Version);
cJSON_AddStringToObject(json, "esVersion", format_es_version(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionSupported", IS_SUPPORTED_ES_VERSION(WebCtx.es_version));
cJSON_AddBoolToObject(json, "esVersionLegacy", USE_LEGACY_ES_SETTINGS(WebCtx.es_version));
cJSON_AddStringToObject(json, "platform", QUOTE(SIST_PLATFORM));
cJSON_AddStringToObject(json, "sist2Hash", Sist2CommitHash);
cJSON_AddStringToObject(json, "libscanHash", LibScanCommitHash);

File diff suppressed because one or more lines are too long

@ -1 +1 @@
Subproject commit 225141eb3df2fc1711962e3779646423407cb3f5
Subproject commit ffd9c23427d0cb105e27f27f0cf97b463b6a8bf8

1
third-party/libscan vendored Submodule

@ -0,0 +1 @@
Subproject commit 3787475ecba7453a2a97ab470103606c2cecabb2

View File

@ -1,12 +0,0 @@
.idea/
cmake_install.cmake
Makefile
libscan.a
libscan.so
*.cbp
CMakeFiles
CMakeCache.txt
scan_test
third-party/ext_*
libscan-test-files
scan_*_test

View File

@ -1,233 +0,0 @@
# Build script for the `scan` library target and, optionally, its test executables.
cmake_minimum_required(VERSION 3.15)
project(scan)
set(CMAKE_C_STANDARD 11)
# Test executables (see the BUILD_TESTS section at the end) are built by default.
option(BUILD_TESTS "Build tests" on)
# antiword is built from the vendored sources in third-party/antiword.
add_subdirectory(third-party/antiword)
add_compile_definitions(
antiword
NDEBUG
)
# Sources of the `scan` library, one sub-directory per supported file family.
add_library(
scan
libscan/util.c libscan/util.h
libscan/scan.h
libscan/macros.h
libscan/text/text.c libscan/text/text.h
libscan/arc/arc.c libscan/arc/arc.h
libscan/ebook/ebook.c libscan/ebook/ebook.h
libscan/comic/comic.c libscan/comic/comic.h
libscan/ooxml/ooxml.c libscan/ooxml/ooxml.h
libscan/media/media.c libscan/media/media.h
libscan/font/font.c libscan/font/font.h
libscan/msdoc/msdoc.c libscan/msdoc/msdoc.h
libscan/json/json.c libscan/json/json.h
libscan/wpd/wpd.c libscan/wpd/wpd.h libscan/wpd/libwpd_c_api.h libscan/wpd/libwpd_c_api.cpp
third-party/utf8.h
libscan/mobi/scan_mobi.c libscan/mobi/scan_mobi.h libscan/raw/raw.c libscan/raw/raw.h)
# The target mixes C and C++ sources (libwpd_c_api.cpp); the link step is forced to use the C linker driver.
set_target_properties(scan PROPERTIES LINKER_LANGUAGE C)
# Library lookup order: .a first, then .lib, then .so.
set(CMAKE_FIND_LIBRARY_SUFFIXES .a .lib .so)
find_package(cJSON CONFIG REQUIRED)
find_package(LibArchive REQUIRED)
find_package(BZip2 REQUIRED)
find_package(lz4 REQUIRED)
find_package(Threads REQUIRED)
find_package(Tesseract CONFIG REQUIRED)
find_package(OpenJPEG CONFIG REQUIRED)
find_package(JPEG REQUIRED)
find_package(LibXml2 REQUIRED)
find_package(LibLZMA REQUIRED)
find_package(ZLIB REQUIRED)
find_package(unofficial-pcre CONFIG REQUIRED)
# The find_library() results below are not marked REQUIRED; a missing library only surfaces at link time.
find_library(JBIG2DEC_LIB NAMES jbig2decd jbig2dec)
find_library(HARFBUZZ_LIB NAMES harfbuzz harfbuzzd)
find_library(FREETYPE_LIB NAMES freetype freetyped)
find_package(unofficial-brotli CONFIG REQUIRED)
find_library(LZO2_LIB NAMES lzo2)
find_library(RAW_LIB NAMES libraw.a)
find_library(MUPDF_LIB NAMES liblibmupdf.a)
find_library(CMS_LIB NAMES lcms2)
find_library(JAS_LIB NAMES jasper)
find_library(GUMBO_LIB NAMES gumbo)
# libgomp is searched in a fixed list of GCC runtime directories (x86_64 and aarch64, several GCC versions).
find_library(GOMP_LIB NAMES libgomp.a gomp PATHS /usr/lib/gcc/x86_64-linux-gnu/5/ /usr/lib/gcc/x86_64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/10/ /usr/lib/gcc/aarch64-linux-gnu/7/ /usr/lib/gcc/aarch64-linux-gnu/9/ /usr/lib/gcc/x86_64-linux-gnu/7/)
# Debug symbols are emitted for the library in every configuration.
target_compile_options(
scan
PRIVATE
-g
)
include(ExternalProject)
# Make program used by every ExternalProject build command below.
find_program(MAKE_EXE NAMES gmake nmake make)
# libmobi: cloned from the "public" ref and built in its own source directory
# (SOURCE_DIR == BINARY_DIR); the update, patch, test and install steps are disabled.
ExternalProject_Add(
libmobi
GIT_REPOSITORY https://github.com/simon987/libmobi.git
GIT_TAG "public"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./autogen.sh && ./configure
INSTALL_COMMAND ""
PREFIX "third-party/ext_libmobi"
SOURCE_DIR "third-party/ext_libmobi/src/libmobi"
BINARY_DIR "third-party/ext_libmobi/src/libmobi"
# NOTE(review): the parallelism is hard-coded rather than derived from the host.
BUILD_COMMAND ${MAKE_EXE} -j 8 --silent
)
# Nothing is installed, so the static library and headers are consumed straight from the build tree.
SET(MOBI_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/.libs/)
SET(MOBI_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libmobi/src/libmobi/src/)
# Extra ffmpeg configure flags: debug info and no optimisation when SIST_DEBUG is set, nothing otherwise.
if (SIST_DEBUG)
SET(FFMPEG_DEBUG "--enable-debug=3" "--disable-optimizations")
else()
SET(FFMPEG_DEBUG "")
endif()
# ffmpeg n4.4: static libraries only, with the command-line programs, documentation and
# the listed optional components disabled. Built in-tree; nothing is installed.
ExternalProject_Add(
ffmpeg
GIT_REPOSITORY https://git.ffmpeg.org/ffmpeg.git
GIT_TAG "n4.4"
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --disable-shared --enable-static --disable-ffmpeg --disable-ffplay
--disable-ffprobe --disable-doc --disable-manpages --disable-postproc --disable-avfilter --disable-alsa
--disable-lzma --disable-xlib --disable-vdpau --disable-vaapi --disable-sdl2
--disable-network ${FFMPEG_DEBUG}
INSTALL_COMMAND ""
PREFIX "third-party/ext_ffmpeg"
SOURCE_DIR "third-party/ext_ffmpeg/src/ffmpeg"
BINARY_DIR "third-party/ext_ffmpeg/src/ffmpeg"
# NOTE(review): the parallelism is hard-coded rather than derived from the host.
BUILD_COMMAND ${MAKE_EXE} -j33 --silent
)
# Libraries and headers both live in the ffmpeg source tree, hence the identical paths.
SET(FFMPEG_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
SET(FFMPEG_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_ffmpeg/src/ffmpeg)
# libwpd 0.9.9: release tarball, static libraries only, built in-tree, nothing installed.
# NOTE(review): the archive is fetched over plain HTTP and no URL_HASH is given, so the
# download is not integrity-checked — consider pinning a hash.
ExternalProject_Add(
libwpd
URL http://prdownloads.sourceforge.net/libwpd/libwpd-0.9.9.tar.gz
UPDATE_COMMAND ""
PATCH_COMMAND ""
TEST_COMMAND ""
CONFIGURE_COMMAND ./configure --without-docs --enable-static --disable-shared
INSTALL_COMMAND ""
PREFIX "third-party/ext_libwpd"
SOURCE_DIR "third-party/ext_libwpd/src/libwpd"
BINARY_DIR "third-party/ext_libwpd/src/libwpd"
BUILD_COMMAND ${MAKE_EXE} -j33
)
SET(WPD_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/src/lib/.libs/)
SET(WPD_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/third-party/ext_libwpd/src/libwpd/inc/)
# The external projects and antiword must be built before `scan` is compiled.
add_dependencies(
scan
libmobi
ffmpeg
antiword
libwpd
)
# Everything is linked PUBLIC, so targets that link `scan` inherit these libraries.
# libmobi, libwpd and ffmpeg are referenced by the paths of the archives produced by
# the external builds above.
target_link_libraries(
scan
PUBLIC
cjson
${LibArchive_LIBRARIES}
ZLIB::ZLIB
BZip2::BZip2
lz4::lz4
${LZO2_LIB}
LibLZMA::LibLZMA
${MUPDF_LIB}
openjp2
${MOBI_LIB_DIR}/libmobi.a
${WPD_LIB_DIR}/libwpd-0.9.a
${WPD_LIB_DIR}/libwpd-stream-0.9.a
${FREETYPE_LIB}
${HARFBUZZ_LIB}
${JBIG2DEC_LIB}
stdc++
# libm is linked between --whole-archive / --no-whole-archive.
-Wl,--whole-archive
m
-Wl,--no-whole-archive
${JPEG_LIBRARIES}
${Tesseract_LIBRARIES}
${LIBXML2_LIBRARIES}
${FREETYPE_LIB}
unofficial::brotli::brotlidec-static
${FFMPEG_LIB_DIR}/libavformat/libavformat.a
${FFMPEG_LIB_DIR}/libavcodec/libavcodec.a
${FFMPEG_LIB_DIR}/libavutil/libavutil.a
${FFMPEG_LIB_DIR}/libswresample/libswresample.a
${FFMPEG_LIB_DIR}/libswscale/libswscale.a
z
${CMAKE_THREAD_LIBS_INIT}
${RAW_LIB}
${GOMP_LIB}
${CMS_LIB}
${JAS_LIB}
${GUMBO_LIB}
dl
antiword
unofficial::pcre::pcre unofficial::pcre::pcre16 unofficial::pcre::pcre32 unofficial::pcre::pcrecpp
)
# NOTE(review): MUPDF_INC_DIR is not set anywhere in this script; presumably the parent project provides it.
target_include_directories(
scan
PUBLIC
${MUPDF_INC_DIR}
${JPEG_INCLUDE_DIR}
${LIBXML2_INCLUDE_DIR}
${FFMPEG_INCLUDE_DIR}
${MOBI_INCLUDE_DIR}
${WPD_INCLUDE_DIR}
)
# The same test sources are built three times: with UndefinedBehaviorSanitizer (scan_ub_test),
# with AddressSanitizer (scan_a_test) and without a sanitizer (scan_test).
if (BUILD_TESTS)
find_package(GTest CONFIG REQUIRED)
add_executable(scan_ub_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_ub_test PRIVATE -g -fsanitize=undefined -fno-omit-frame-pointer)
target_link_libraries(scan_ub_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=undefined scan)
add_executable(scan_a_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_a_test PRIVATE -g -fsanitize=address -fno-omit-frame-pointer)
target_link_libraries(scan_a_test PRIVATE GTest::gtest GTest::gtest_main -fsanitize=address scan)
add_executable(scan_test test/main.cpp test/test_util.cpp test/test_util.h)
target_compile_options(scan_test PRIVATE -g -fno-omit-frame-pointer)
target_link_libraries(scan_test PRIVATE GTest::gtest GTest::gtest_main scan)
endif()

View File

@ -1,4 +0,0 @@
### Run fuzz tests:
```bash
./scan_a_test --gtest_filter=*Fuzz* --gtest_repeat=100
```

View File

@ -1,244 +0,0 @@
#include "arc.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <openssl/evp.h>
#include <pcre.h>
/**
 * Decide whether a compressed ("filtered") file wraps a tar archive.
 *
 * @param filepath Path of the file.
 * @param ext      Offset in filepath of the first character of the extension
 *                 (the byte right after the final '.'); 0 means "no extension".
 *                 ext - 1 must not exceed strlen(filepath).
 * @return 1 when the extension starts with "tgz", or when the path with its
 *         last extension removed ends in ".tar" (e.g. "a.tar.gz"); 0 otherwise.
 *
 * The check is done in place. The previous implementation copied the stem into
 * a fixed PATH_MAX * 2 stack buffer without validating ext, so a negative or
 * oversized offset corrupted the stack.
 */
int should_parse_filtered_file(const char *filepath, int ext) {
    static const char tar_suffix[] = ".tar";
    const int suffix_len = (int) sizeof(tar_suffix) - 1;

    const int is_tgz = ext > 0 && strncmp(filepath + ext, "tgz", 3) == 0;

    // The stem is filepath[0 .. ext - 2]; it ends in ".tar" exactly when the
    // suffix_len bytes just before the final dot spell ".tar".
    const int stem_is_tar = ext > suffix_len
                            && memcmp(filepath + ext - 1 - suffix_len, tar_suffix, (size_t) suffix_len) == 0;

    return is_tgz || stem_is_tar;
}
/**
 * vfile close handler for archive entries: finalises the SHA1 digest into
 * f->sha1_digest and releases the rewind buffer, if one was allocated.
 */
void arc_close(struct vfile *f) {
    SHA1_Final(f->sha1_digest, &f->sha1_ctx);

    if (f->rewind_buffer == NULL) {
        return;
    }

    free(f->rewind_buffer);
    f->rewind_buffer = NULL;
    f->rewind_buffer_size = 0;
    f->rewind_buffer_cursor = 0;
}
/**
 * vfile read handler for archive entries.
 *
 * Bytes held in the rewind buffer (filled by arc_read_rewindable()) are served
 * first; anything beyond that is read from the archive. Bytes read from the
 * archive are fed to the SHA1 context when f->calculate_checksum is set.
 *
 * @param f    Archive-backed vfile.
 * @param buf  Destination buffer of at least size bytes.
 * @param size Number of bytes requested.
 * @return Number of bytes placed in buf, or -1 on a read error.
 */
int arc_read(struct vfile *f, void *buf, size_t size) {
    int bytes_copied = 0;

    if (f->rewind_buffer_size != 0) {
        if (size > f->rewind_buffer_size) {
            // Request spans the rest of the rewind buffer and fresh archive data.
            memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, f->rewind_buffer_size);

            bytes_copied = f->rewind_buffer_size;
            size -= f->rewind_buffer_size;
            buf = (char *) buf + f->rewind_buffer_size;
            f->rewind_buffer_size = 0;
        } else {
            // Request is fully satisfied from the rewind buffer.
            memcpy(buf, f->rewind_buffer + f->rewind_buffer_cursor, size);
            f->rewind_buffer_size -= (int) size;
            f->rewind_buffer_cursor += (int) size;

            return (int) size;
        }
    }

    // archive_read_data() reports failures as negative values. Keeping the
    // result signed stops an error code from being mistaken for a byte count
    // (previously it was stored in a size_t and could leak out as
    // "error code + bytes_copied").
    ssize_t bytes_read = archive_read_data(f->arc, buf, size);

    if (bytes_read > 0 && (size_t) bytes_read <= size && f->calculate_checksum) {
        f->has_checksum = TRUE;

        safe_sha1_update(&f->sha1_ctx, (unsigned char *) buf, bytes_read);
    }

    if (bytes_read < 0 || ((size_t) bytes_read != size && archive_errno(f->arc) != 0)) {
        const char *error_str = archive_error_string(f->arc);
        if (error_str != NULL) {
            f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
        }
        return -1;
    }

    return (int) bytes_read + bytes_copied;
}
/**
 * Read from the archive and remember the bytes so that the next arc_read()
 * calls return them again.
 *
 * May only be called once per vfile; a second call terminates the process.
 *
 * @param f    Archive-backed vfile.
 * @param buf  Destination buffer of at least size bytes.
 * @param size Number of bytes requested.
 * @return Number of bytes read, or -1 on a read or allocation error.
 */
int arc_read_rewindable(struct vfile *f, void *buf, size_t size) {
    if (f->rewind_buffer != NULL) {
        fprintf(stderr, "Allocated rewind buffer more than once for %s", f->filepath);
        exit(-1);
    }

    // Signed on purpose: archive_read_data() returns a negative value on failure.
    ssize_t bytes_read = archive_read_data(f->arc, buf, size);

    if (bytes_read < 0 || ((size_t) bytes_read != size && archive_errno(f->arc) != 0)) {
        const char *error_str = archive_error_string(f->arc);
        if (error_str != NULL) {
            f->logf(f->filepath, LEVEL_ERROR, "Error reading archive file: %s", error_str);
        }
        return -1;
    }

    // Record only the bytes that were actually produced. Recording `size`
    // bytes after a short read made arc_read() replay whatever happened to
    // follow the real data in the caller's buffer.
    // At least one byte is allocated so the "already allocated" guard above
    // also triggers after an empty read.
    f->rewind_buffer = malloc(bytes_read > 0 ? (size_t) bytes_read : 1);
    if (f->rewind_buffer == NULL) {
        f->logf(f->filepath, LEVEL_ERROR, "Could not allocate rewind buffer (%d bytes)", (int) bytes_read);
        return -1;
    }
    f->rewind_buffer_size = (int) bytes_read;
    f->rewind_buffer_cursor = 0;
    memcpy(f->rewind_buffer, buf, (size_t) bytes_read);

    return (int) bytes_read;
}
/**
 * Create a libarchive reader for f.
 *
 * Files on the filesystem are opened by name. Any other vfile is read through
 * the vfile_*_callback functions, and only when allow_recurse is non-zero.
 *
 * @param ctx           Archive context; a non-empty passphrase is registered with the reader.
 * @param f             File to open.
 * @param a             Receives the new reader (left untouched when ARC_SKIPPED is returned).
 * @param arc_data      Callback state; its f member is always set to f.
 * @param allow_recurse Whether vfiles that are not filesystem files may be opened.
 * @return ARC_SKIPPED, or the result of the libarchive open call.
 */
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse) {
    arc_data->f = f;

    if (!f->is_fs_file && !allow_recurse) {
        return ARC_SKIPPED;
    }

    *a = archive_read_new();
    archive_read_support_filter_all(*a);
    archive_read_support_format_all(*a);
    if (ctx->passphrase[0] != 0) {
        archive_read_add_passphrase(*a, ctx->passphrase);
    }

    if (f->is_fs_file) {
        return archive_read_open_filename(*a, f->filepath, ARC_BUF_SIZE);
    }

    return archive_read_open(
            *a, arc_data,
            vfile_open_callback,
            vfile_read_callback,
            vfile_close_callback
    );
}
// Per-thread output vector for pcre_exec(); its element count must be a multiple of 3.
static __thread int sub_strings[30];

// True when `str` matches the exclude pattern. Expands to a use of `exclude` and
// `exclude_extra`, which must be in scope at the call site.
// pcre_exec() expects the NUMBER OF ELEMENTS of the output vector, not its size in
// bytes; passing sizeof(sub_strings) told PCRE the vector was four times larger
// than it is.
#define EXCLUDED(str) (pcre_exec(exclude, exclude_extra, (str), (int) strlen(str), 0, 0, sub_strings, (int) (sizeof(sub_strings) / sizeof(sub_strings[0]))) >= 0)
/**
 * Scan the contents of an archive.
 *
 * In ARC_MODE_LIST the names of all regular-file entries are joined with
 * spaces and attached to doc as MetaContent. In every other mode that gets
 * past arc_open(), each regular-file entry that is not excluded is handed to
 * ctx->parse() as a sub-job whose path is "<archive path>#/<entry path>".
 *
 * @param ctx           Archive scanner context.
 * @param f             The archive file.
 * @param doc           Document describing the archive.
 * @param exclude       Compiled exclude pattern, or NULL for none.
 * @param exclude_extra Study data for exclude (passed through to pcre_exec()).
 * @return SCAN_OK, or SCAN_ERR_READ when the archive cannot be opened.
 */
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;

    arc_data_t arc_data;
    arc_data.f = f;

    int ret = arc_open(ctx, f, &a, &arc_data, ctx->mode == ARC_MODE_RECURSE);
    if (ret == ARC_SKIPPED) {
        return SCAN_OK;
    }

    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(arc.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return SCAN_ERR_READ;
    }

    if (ctx->mode == ARC_MODE_LIST) {
        dyn_buffer_t buf = dyn_buffer_create();

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            if (S_ISREG(archive_entry_stat(entry)->st_mode)) {
                const char *utf8_name = archive_entry_pathname_utf8(entry);
                const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

                dyn_buffer_append_string(&buf, file_path);
                dyn_buffer_write_char(&buf, ' ');
            }
        }
        dyn_buffer_write_char(&buf, '\0');

        meta_line_t *meta_list = malloc(sizeof(meta_line_t) + buf.cur);
        meta_list->key = MetaContent;
        strcpy(meta_list->str_val, buf.buf);
        APPEND_META(doc, meta_list)
        dyn_buffer_destroy(&buf);

    } else {
        // Room reserved after the job struct for the sub-job path.
        // Assumes parse_job_t.filepath is the trailing member the extra bytes
        // are meant for — TODO confirm against scan.h.
        const size_t filepath_cap = PATH_MAX * 2;

        parse_job_t *sub_job = malloc(sizeof(parse_job_t) + filepath_cap);

        sub_job->vfile.close = arc_close;
        sub_job->vfile.read = arc_read;
        sub_job->vfile.read_rewindable = arc_read_rewindable;
        sub_job->vfile.reset = NULL;
        sub_job->vfile.arc = a;
        sub_job->vfile.filepath = sub_job->filepath;
        sub_job->vfile.is_fs_file = FALSE;
        sub_job->vfile.rewind_buffer_size = 0;
        sub_job->vfile.rewind_buffer = NULL;
        sub_job->vfile.log = ctx->log;
        sub_job->vfile.logf = ctx->logf;
        sub_job->vfile.has_checksum = FALSE;
        sub_job->vfile.calculate_checksum = f->calculate_checksum;
        memcpy(sub_job->parent, doc->path_md5, MD5_DIGEST_LENGTH);

        while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
            sub_job->vfile.info = *archive_entry_stat(entry);
            if (S_ISREG(sub_job->vfile.info.st_mode)) {

                const char *utf8_name = archive_entry_pathname_utf8(entry);
                const char *entry_name = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;

                // Entry names come from the archive and can be arbitrarily
                // long: bound the write and skip entries that do not fit
                // instead of overrunning the buffer.
                int written = snprintf(sub_job->filepath, filepath_cap, "%s#/%s", f->filepath, entry_name);
                if (written < 0 || (size_t) written >= filepath_cap) {
                    CTX_LOG_ERRORF(f->filepath, "(arc.c) Skipping entry, path too long: %s", entry_name)
                    continue;
                }

                sub_job->base = (int) (strrchr(sub_job->filepath, '/') - sub_job->filepath) + 1;

                // Handle excludes
                if (exclude != NULL && EXCLUDED(sub_job->filepath)) {
                    CTX_LOG_DEBUGF("arc.c", "Excluded: %s", sub_job->filepath)
                    continue;
                }

                // Only a dot located after the archive's own path counts as
                // the start of the entry's extension.
                char *p = strrchr(sub_job->filepath, '.');
                if (p != NULL && (p - sub_job->filepath) > strlen(f->filepath)) {
                    sub_job->ext = (int) (p - sub_job->filepath + 1);
                } else {
                    sub_job->ext = (int) strlen(sub_job->filepath);
                }

                SHA1_Init(&sub_job->vfile.sha1_ctx);

                ctx->parse(sub_job);
            }
        }

        free(sub_job);
    }

    archive_read_free(a);
    return SCAN_OK;
}

View File

@ -1,80 +0,0 @@
#ifndef SCAN_ARC_H
#define SCAN_ARC_H
#include <archive.h>
#include <archive_entry.h>
#include <fcntl.h>
#include <pcre.h>
#include "../scan.h"
// Returned by arc_open() when the file is not a filesystem file and recursion is not allowed.
# define ARC_SKIPPED (-1)
// Values for scan_arc_ctx_t.mode.
#define ARC_MODE_SKIP 0
// parse_archive() only records the entry names as the document's content.
#define ARC_MODE_LIST 1
#define ARC_MODE_SHALLOW 2
// The only mode in which parse_archive() lets arc_open() open non-filesystem vfiles.
#define ARC_MODE_RECURSE 3
typedef int archive_mode_t;
// Settings and callbacks used by the archive scanner.
typedef struct {
archive_mode_t mode;
// Called by parse_archive() for each regular-file entry (modes other than ARC_MODE_LIST).
parse_callback_t parse;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
// Registered with libarchive when its first byte is non-zero; an empty string means "no passphrase".
char passphrase[4096];
} scan_arc_ctx_t;
// Block size passed to archive_read_open_filename() and chunk size of vfile_read_callback().
#define ARC_BUF_SIZE 8192
// State handed to the libarchive callbacks when an archive is read through a vfile.
typedef struct {
// File the archive bytes are pulled from.
vfile_t *f;
// Scratch buffer the read callback fills and exposes to libarchive.
char buf[ARC_BUF_SIZE];
} arc_data_t;
static int vfile_open_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Init(&data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
static long vfile_read_callback(struct archive *a, void *user_data, const void **buf) {
arc_data_t *data = (arc_data_t *) user_data;
*buf = data->buf;
long ret = data->f->read(data->f, data->buf, sizeof(data->buf));
if (!data->f->is_fs_file && ret > 0) {
data->f->has_checksum = TRUE;
safe_sha1_update(&data->f->sha1_ctx, (unsigned char*)data->buf, ret);
}
return ret;
}
static int vfile_close_callback(struct archive *a, void *user_data) {
arc_data_t *data = (arc_data_t *) user_data;
if (!data->f->is_fs_file) {
SHA1_Final((unsigned char *) data->f->sha1_digest, &data->f->sha1_ctx);
}
return ARCHIVE_OK;
}
// Creates a libarchive reader for f in *a; returns ARC_SKIPPED or the libarchive open result.
int arc_open(scan_arc_ctx_t *ctx, vfile_t *f, struct archive **a, arc_data_t *arc_data, int allow_recurse);
// Tells whether a compressed file wraps a tar archive ("tgz" extension or ".tar" before the last extension).
int should_parse_filtered_file(const char *filepath, int ext);
// Lists or parses the entries of an archive, depending on ctx->mode.
scan_code_t parse_archive(scan_arc_ctx_t *ctx, vfile_t *f, document_t *doc, pcre *exclude, pcre_extra *exclude_extra);
// vfile handlers installed on archive entries by parse_archive().
int arc_read(struct vfile *f, void *buf, size_t size);
int arc_read_rewindable(struct vfile *f, void *buf, size_t size);
void arc_close(struct vfile *f);
#endif

View File

@ -1,58 +0,0 @@
#include "comic.h"
#include "../media/media.h"
#include "../arc/arc.h"
#include <stdlib.h>
#include <archive.h>
// Archive context with an empty passphrase, used only to open comic archives with arc_open().
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
/**
 * Create the thumbnail of a comic archive from one of its images.
 *
 * Entries are visited in archive order. Every regular file ending in ".png",
 * ".jpg" or ".jpeg" (case-sensitive) is loaded and passed to
 * store_image_thumbnail() until one call returns TRUE. Nothing is done when
 * thumbnails are disabled (ctx->tn_size <= 0).
 *
 * @param ctx Comic scanner context.
 * @param f   The comic archive.
 * @param doc Document the thumbnail belongs to.
 */
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;
    arc_data_t arc_data;

    if (ctx->tn_size <= 0) {
        return;
    }

    int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(cbr.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return;
    }

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        struct stat info = *archive_entry_stat(entry);
        if (!S_ISREG(info.st_mode)) {
            continue;
        }

        const char *utf8_name = archive_entry_pathname_utf8(entry);
        const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
        if (file_path == NULL) {
            // No usable name: there is nothing to match an extension against.
            continue;
        }

        const char *p = strrchr(file_path, '.');
        if (p == NULL || (strcmp(p, ".png") != 0 && strcmp(p, ".jpg") != 0 && strcmp(p, ".jpeg") != 0)) {
            continue;
        }

        size_t entry_size = archive_entry_size(entry);
        void *buf = malloc(entry_size);
        if (buf == NULL && entry_size > 0) {
            // The size comes from the archive and may be huge; try the next image
            // rather than handing a NULL buffer to libarchive.
            CTX_LOG_ERRORF(f->filepath, "(comic.c) Could not allocate %zu bytes for entry %s", entry_size, file_path)
            continue;
        }

        // Signed: archive_read_data() returns a negative value on failure.
        ssize_t bytes_read = archive_read_data(a, buf, entry_size);
        if (bytes_read < 0 || (size_t) bytes_read != entry_size) {
            const char *err_str = archive_error_string(a);
            if (err_str) {
                CTX_LOG_ERRORF("comic.c", "Error while reading entry: %s", err_str)
            }
            free(buf);
            break;
        }

        ret = store_image_thumbnail((scan_media_ctx_t *) ctx, buf, entry_size, doc, file_path);
        free(buf);

        if (ret == TRUE) {
            break;
        }
    }

    archive_read_free(a);
}

View File

@ -1,31 +0,0 @@
#ifndef SCAN_CBR_H
#define SCAN_CBR_H
#include <stdlib.h>
#include "../ebook/ebook.h"
// Settings and callbacks of the comic scanner.
// NOTE(review): parse_comic() casts a pointer to this struct to scan_media_ctx_t *, so
// the field layout presumably has to stay compatible with that struct — verify before
// reordering or inserting fields.
typedef struct {
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
// Thumbnail size; parse_comic() does nothing when this is <= 0.
int tn_size;
float tn_qscale;
// MIME identifiers compared by is_cbr() / is_cbz().
unsigned int cbr_mime;
unsigned int cbz_mime;
} scan_comic_ctx_t;
/* Non-zero when mime is the CBR MIME identifier configured in ctx. */
__always_inline
static int is_cbr(scan_comic_ctx_t *ctx, unsigned int mime) {
    return ctx->cbr_mime == mime;
}
/* Non-zero when mime is the CBZ MIME identifier configured in ctx. */
__always_inline
static int is_cbz(scan_comic_ctx_t *ctx, unsigned int mime) {
    return ctx->cbz_mime == mime;
}
// Stores a thumbnail for the comic archive f, taken from one of its .png/.jpg/.jpeg entries.
void parse_comic(scan_comic_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@ -1,495 +0,0 @@
#include "ebook.h"
#include <mupdf/fitz.h>
#include <pthread.h>
#include <tesseract/capi.h>
#include "../media/media.h"
#include "../arc/arc.h"
// Images must be larger than this (in pixels, per dimension checked by fill_image()) to be sent to OCR.
#define MIN_OCR_SIZE 350
// OCR results shorter than this many bytes are discarded.
#define MIN_OCR_LEN 10
/* fill_image callback doesn't let us pass opaque pointers unless I create my own device */
// Per-thread text accumulator and copy of the scanner context, read by fill_image()
// and by the MuPDF error/warning callbacks.
__thread text_buffer_t thread_buffer;
__thread scan_ebook_ctx_t thread_ctx;
// Process-wide mutex taken by my_fz_lock()/my_fz_unlock() for FZ_LOCK_FREETYPE.
pthread_mutex_t Mutex;
/* MuPDF lock callback: only the FreeType lock is backed by a real mutex. */
static void my_fz_lock(UNUSED(void *user), int lock) {
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }
    pthread_mutex_lock(&Mutex);
}
/* MuPDF unlock callback: counterpart of my_fz_lock(), only acts on the FreeType lock. */
static void my_fz_unlock(UNUSED(void *user), int lock) {
    if (lock != FZ_LOCK_FREETYPE) {
        return;
    }
    pthread_mutex_unlock(&Mutex);
}
/**
 * Tell whether every sample of a pixmap has the same value.
 *
 * @param pixmap Pixmap to inspect.
 * @return TRUE when all n * w * h samples are identical (an empty pixmap
 *         counts as blank), FALSE otherwise.
 */
int pixmap_is_blank(const fz_pixmap *pixmap) {
    // An empty pixmap has no first sample to compare against; reading
    // samples[0] there would be out of bounds.
    if (pixmap->samples == NULL || pixmap->n <= 0 || pixmap->w <= 0 || pixmap->h <= 0) {
        return TRUE;
    }

    // size_t arithmetic: the product can exceed INT_MAX for large pixmaps.
    const size_t sample_count = (size_t) pixmap->n * (size_t) pixmap->w * (size_t) pixmap->h;
    const unsigned char first = pixmap->samples[0];

    for (size_t i = 1; i < sample_count; i++) {
        if (pixmap->samples[i] != first) {
            return FALSE;
        }
    }
    return TRUE;
}
// Loads page number `page` and renders it into a new RGB pixmap, scaled so that the
// longer side of the page equals ctx->tn_size.
// On success the pixmap is returned and *cover holds the loaded page (render_cover()
// drops both). On failure NULL is returned; the page and pixmap are dropped here when
// the failure happens after they were created.
fz_pixmap *
load_pixmap(scan_ebook_ctx_t *ctx, int page, fz_context *fzctx, fz_document *fzdoc, document_t *doc, fz_page **cover) {
int err = 0;
fz_var(cover);
fz_var(err);
// A failed page load is flagged with the constant 1, so the code logged below is always 1.
fz_try(fzctx)*cover = fz_load_page(fzctx, fzdoc, page);
fz_catch(fzctx)err = 1;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
return NULL;
}
fz_rect bounds = fz_bound_page(fzctx, *cover);
// Uniform scale factor derived from the longer side of the page.
float scale;
float w = bounds.x1 - bounds.x0;
float h = bounds.y1 - bounds.y0;
if (w > h) {
scale = (float) ctx->tn_size / w;
} else {
scale = (float) ctx->tn_size / h;
}
fz_matrix m = fz_scale(scale, scale);
bounds = fz_transform_rect(bounds, m);
fz_irect bbox = fz_round_rect(bounds);
// Every sample starts at 0xFF before the page is drawn on top.
fz_pixmap *pixmap = fz_new_pixmap_with_bbox(fzctx, fz_device_rgb(fzctx), bbox, NULL, 0);
fz_clear_pixmap_with_value(fzctx, pixmap, 0xFF);
fz_device *dev = fz_new_draw_device(fzctx, m, pixmap);
fz_var(err);
fz_try(fzctx) {
fz_run_page(fzctx, *cover, dev, fz_identity, NULL);
} fz_always(fzctx) {
// The device is closed and dropped whether or not rendering succeeded.
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
// render_cover() converts the samples as packed RGB24, so exactly 3 components are required.
if (pixmap->n != 3) {
CTX_LOG_ERRORF(doc->filepath, "Got unexpected pixmap depth: %d", pixmap->n)
fz_drop_page(fzctx, *cover);
fz_drop_pixmap(fzctx, pixmap);
return NULL;
}
return pixmap;
}
/**
 * Render the cover of a document as a JPEG thumbnail and store it.
 *
 * Page 0 is used unless it renders as a uniform image, in which case page 1
 * is tried instead. The rendered RGB pixmap is converted to YUV420P, encoded
 * as JPEG and passed to ctx->store() keyed by doc->path_md5.
 *
 * @param ctx   Ebook scanner context (thumbnail size, quality, store callback).
 * @param fzctx MuPDF context.
 * @param doc   Document the thumbnail belongs to.
 * @param fzdoc Opened MuPDF document.
 * @return TRUE when a thumbnail was stored, FALSE when no page could be
 *         rendered or the conversion buffer could not be allocated.
 */
int render_cover(scan_ebook_ctx_t *ctx, fz_context *fzctx, document_t *doc, fz_document *fzdoc) {
    fz_page *cover = NULL;
    fz_pixmap *pixmap = load_pixmap(ctx, 0, fzctx, fzdoc, doc, &cover);
    if (pixmap == NULL) {
        return FALSE;
    }

    if (pixmap_is_blank(pixmap)) {
        fz_drop_page(fzctx, cover);
        fz_drop_pixmap(fzctx, pixmap);
        CTX_LOG_DEBUG(doc->filepath, "Cover page is blank, using page 1 instead")
        pixmap = load_pixmap(ctx, 1, fzctx, fzdoc, doc, &cover);
        if (pixmap == NULL) {
            return FALSE;
        }
    }

    // The pixmap is copied into a large zero-filled scratch buffer before scaling.
    // NOTE(review): the 1 GiB size looks like a workaround for sws_scale() reading
    // past the end of its input — confirm before shrinking it.
    // The allocation is made first so a failure can bail out before any FFmpeg
    // object exists; the copy is also checked against the buffer size.
    const size_t scratch_size = 1024 * 1024 * 1024;
    const size_t sample_bytes = (size_t) pixmap->stride * (size_t) pixmap->h;
    unsigned char *samples = sample_bytes <= scratch_size ? calloc(1, scratch_size) : NULL;
    if (samples == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not allocate scratch buffer for thumbnail conversion")
        fz_drop_pixmap(fzctx, pixmap);
        fz_drop_page(fzctx, cover);
        return FALSE;
    }
    memcpy(samples, pixmap->samples, sample_bytes);

    // RGB24 -> YUV420p
    AVFrame *scaled_frame = av_frame_alloc();

    struct SwsContext *sws_ctx = sws_getContext(
            pixmap->w, pixmap->h, AV_PIX_FMT_RGB24,
            pixmap->w, pixmap->h, AV_PIX_FMT_YUV420P,
            SIST_SWS_ALGO, 0, 0, 0
    );

    int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h, 1);
    uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);

    av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, pixmap->w, pixmap->h,
                         1);

    const uint8_t *in_data[1] = {samples,};
    int in_line_size[1] = {(int) pixmap->stride};

    sws_scale(sws_ctx,
              in_data, in_line_size,
              0, pixmap->h,
              scaled_frame->data, scaled_frame->linesize
    );

    scaled_frame->width = pixmap->w;
    scaled_frame->height = pixmap->h;
    scaled_frame->format = AV_PIX_FMT_YUV420P;

    sws_freeContext(sws_ctx);

    // YUV420p -> JPEG
    AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(pixmap->w, pixmap->h, ctx->tn_qscale);
    avcodec_send_frame(jpeg_encoder, scaled_frame);

    AVPacket jpeg_packet;
    av_init_packet(&jpeg_packet);
    avcodec_receive_packet(jpeg_encoder, &jpeg_packet);

    APPEND_TN_META(doc, pixmap->w, pixmap->h)
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);

    free(samples);
    av_packet_unref(&jpeg_packet);
    // data[0] is dst_buf, which was allocated with av_malloc() above.
    av_free(*scaled_frame->data);
    av_frame_free(&scaled_frame);
    avcodec_free_context(&jpeg_encoder);

    fz_drop_pixmap(fzctx, pixmap);
    fz_drop_page(fzctx, cover);

    return TRUE;
}
// MuPDF error printer (installed by init_fzctx()): forwards the message to the
// warning log under the document's path. `user` is the document_t registered as
// print_user.
void fz_err_callback(void *user, const char *message) {
document_t *doc = (document_t *) user;
// Otherwise unused local: presumably the logging macro below expands to a use of `ctx`.
const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_WARNINGF(doc->filepath, "FZ: %s", message)
}
// MuPDF warning printer (installed by init_fzctx()): forwards the message to the
// debug log under the document's path. `user` is the document_t registered as
// print_user.
void fz_warn_callback(void *user, const char *message) {
document_t *doc = (document_t *) user;
// Otherwise unused local: presumably the logging macro below expands to a use of `ctx`.
const scan_ebook_ctx_t *ctx = &thread_ctx;
CTX_LOG_DEBUGF(doc->filepath, "FZ: %s", message)
}
static void init_fzctx(fz_context *fzctx, document_t *doc) {
fz_register_document_handlers(fzctx);
static int mu_is_initialized = FALSE;
if (!mu_is_initialized) {
pthread_mutex_init(&Mutex, NULL);
mu_is_initialized = TRUE;
}
fzctx->warn.print_user = doc;
fzctx->warn.print = fz_warn_callback;
fzctx->error.print_user = doc;
fzctx->error.print = fz_err_callback;
fzctx->locks.lock = my_fz_lock;
fzctx->locks.unlock = my_fz_unlock;
}
/**
 * Append the characters of a structured-text block to tex.
 *
 * Blocks that are not text blocks are ignored. A space is written before each
 * line and once more after the last line.
 *
 * @return TEXT_BUF_FULL as soon as a character of the block no longer fits,
 *         0 otherwise.
 */
static int read_stext_block(fz_stext_block *block, text_buffer_t *tex) {
    if (block->type != FZ_STEXT_BLOCK_TEXT) {
        return 0;
    }

    for (fz_stext_line *line = block->u.t.first_line; line != NULL; line = line->next) {
        text_buffer_append_char(tex, ' ');

        for (fz_stext_char *ch = line->first_char; ch != NULL; ch = ch->next) {
            if (text_buffer_append_char(tex, ch->c) == TEXT_BUF_FULL) {
                return TEXT_BUF_FULL;
            }
        }
    }

    text_buffer_append_char(tex, ' ');
    return 0;
}
#define IS_VALID_BPP(d) ((d)==1 || (d)==2 || (d)==4 || (d)==8 || (d)==16 || (d)==24 || (d)==32)

/**
 * Device callback invoked for each image on a page: runs Tesseract on large
 * enough images and appends the recognised text to the per-thread text buffer.
 *
 * Uses thread_ctx for the Tesseract data path and language, because the
 * callback signature has no user-data parameter.
 */
void fill_image(fz_context *fzctx, UNUSED(fz_device *dev),
                fz_image *img, UNUSED(fz_matrix ctm), UNUSED(float alpha),
                UNUSED(fz_color_params color_params)) {

    int l2factor = 0;

    if (img->w <= MIN_OCR_SIZE || img->h <= MIN_OCR_SIZE || !IS_VALID_BPP(img->n)) {
        return;
    }

    fz_pixmap *pix = img->get_pixmap(fzctx, img, NULL, img->w, img->h, &l2factor);

    // NOTE(review): the condition checks pix->h and img->h but never pix->w;
    // it looks like pix->w was intended — confirm before changing.
    if (pix->h > MIN_OCR_SIZE && img->h > MIN_OCR_SIZE && img->xres != 0) {
        TessBaseAPI *api = TessBaseAPICreate();

        // TessBaseAPIInit3() returns 0 on success; using an engine that failed
        // to initialise (missing data path or language) is not safe.
        if (TessBaseAPIInit3(api, thread_ctx.tesseract_path, thread_ctx.tesseract_lang) == 0) {
            TessBaseAPISetImage(api, pix->samples, pix->w, pix->h, pix->n, pix->stride);
            TessBaseAPISetSourceResolution(api, pix->xres);

            char *text = TessBaseAPIGetUTF8Text(api);
            if (text != NULL) {
                size_t len = strlen(text);
                if (len >= MIN_OCR_LEN) {
                    text_buffer_append_string(&thread_buffer, text, len - 1);
                }
                // The returned string is owned by the caller and was leaked before.
                TessDeleteText(text);
            }

            TessBaseAPIEnd(api);
        }
        TessBaseAPIDelete(api);
    }
    fz_drop_pixmap(fzctx, pix);
}
// Parses a document held in memory with MuPDF.
// In order: records the page count (MetaPages), stores a cover thumbnail when
// ctx->tn_size > 0, stops there if tn_only is set, then records title and author
// metadata and, when ctx->content_size > 0, the extracted page text (MetaContent).
// Any MuPDF failure releases the stream, document and context and returns without
// reporting an error to the caller.
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only) {
fz_context *fzctx = fz_new_context(NULL, NULL, FZ_STORE_DEFAULT);
// Copy of the context for callbacks that have no user-data parameter (fill_image, log printers).
thread_ctx = *ctx;
init_fzctx(fzctx, doc);
int err = 0;
fz_document *fzdoc = NULL;
fz_stream *stream = NULL;
fz_var(fzdoc);
fz_var(stream);
fz_var(err);
fz_try(fzctx) {
stream = fz_open_memory(fzctx, buf, buf_len);
fzdoc = fz_open_document_with_stream(fzctx, mime_str, stream);
} fz_catch(fzctx)err = fzctx->error.errcode;
// A document that cannot be opened is skipped without a log entry from this function.
if (err != 0) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
int page_count = -1;
fz_var(err);
fz_try(fzctx)page_count = fz_count_pages(fzctx, fzdoc);
fz_catch(fzctx)err = fzctx->error.errcode;
if (err) {
CTX_LOG_WARNINGF(doc->filepath, "fz_count_pages() returned error code [%d] %s", err, fzctx->error.message)
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
APPEND_LONG_META(doc, MetaPages, page_count)
// A failed cover render aborts the whole parse: no title, author or content is recorded.
if (ctx->tn_size > 0) {
if (render_cover(ctx, fzctx, doc, fzdoc) == FALSE) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
}
if (tn_only) {
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
// Metadata lookups are best-effort: exceptions are swallowed and an empty string means "absent".
char title[8192] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_TITLE, title, sizeof(title));
fz_catch(fzctx);
if (strlen(title) > 0) {
APPEND_UTF8_META(doc, MetaTitle, title)
}
char author[4096] = {'\0',};
fz_try(fzctx)fz_lookup_metadata(fzctx, fzdoc, FZ_META_INFO_AUTHOR, author, sizeof(author));
fz_catch(fzctx);
if (strlen(author) > 0) {
APPEND_UTF8_META(doc, MetaAuthor, author)
}
if (ctx->content_size > 0) {
fz_stext_options opts = {0};
thread_buffer = text_buffer_create(ctx->content_size);
// Pages are processed in order until the text buffer is full or a page fails.
for (int current_page = 0; current_page < page_count; current_page++) {
fz_page *page = NULL;
fz_var(err);
fz_try(fzctx)page = fz_load_page(fzctx, fzdoc, current_page);
fz_catch(fzctx)err = fzctx->error.errcode;
// A page that fails to load discards the text gathered so far and ends the parse.
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_load_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
fz_stext_page *stext = fz_new_stext_page(fzctx, fz_bound_page(fzctx, page));
fz_device *dev = fz_new_stext_device(fzctx, stext, &opts);
// These device callbacks are cleared — presumably to skip work that is not needed for text extraction.
dev->stroke_path = NULL;
dev->stroke_text = NULL;
dev->clip_text = NULL;
dev->clip_stroke_path = NULL;
dev->clip_stroke_text = NULL;
// OCR of embedded images is only enabled when a Tesseract language is configured.
if (ctx->tesseract_lang != NULL) {
dev->fill_image = fill_image;
}
fz_var(err);
fz_try(fzctx)fz_run_page(fzctx, page, dev, fz_identity, NULL);
fz_always(fzctx) {
fz_close_device(fzctx, dev);
fz_drop_device(fzctx, dev);
} fz_catch(fzctx)err = fzctx->error.errcode;
if (err != 0) {
CTX_LOG_WARNINGF(doc->filepath, "fz_run_page() returned error code [%d] %s", err, fzctx->error.message)
text_buffer_destroy(&thread_buffer);
fz_drop_page(fzctx, page);
fz_drop_stext_page(fzctx, stext);
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
return;
}
// Copy the page's text blocks into the buffer, stopping at the first block that does not fit.
fz_stext_block *block = stext->first_block;
while (block != NULL) {
int ret = read_stext_block(block, &thread_buffer);
if (ret == TEXT_BUF_FULL) {
break;
}
block = block->next;
}
fz_drop_stext_page(fzctx, stext);
fz_drop_page(fzctx, page);
// Stop reading pages once the configured content size is reached.
if (thread_buffer.dyn_buffer.cur >= ctx->content_size) {
break;
}
}
text_buffer_terminate_string(&thread_buffer);
// The accumulated text is copied into a meta line sized to the buffer's current length.
meta_line_t *meta_content = malloc(sizeof(meta_line_t) + thread_buffer.dyn_buffer.cur);
meta_content->key = MetaContent;
memcpy(meta_content->str_val, thread_buffer.dyn_buffer.buf, thread_buffer.dyn_buffer.cur);
APPEND_META(doc, meta_content)
text_buffer_destroy(&thread_buffer);
}
fz_drop_stream(fzctx, stream);
fz_drop_document(fzctx, fzdoc);
fz_drop_context(fzctx);
}
// Archive context with an empty passphrase, used only to open EPUB containers with arc_open().
static scan_arc_ctx_t arc_ctx = (scan_arc_ctx_t) {.passphrase = {0,}};
/**
 * Extract the text of an EPUB without MuPDF: the container is opened as an
 * archive and the markup of every ".html" / ".xhtml" entry is appended to the
 * document's MetaContent, up to ctx->content_size.
 *
 * @param ctx Ebook scanner context.
 * @param f   The EPUB file.
 * @param doc Document receiving the content.
 */
void parse_epub_fast(scan_ebook_ctx_t *ctx, vfile_t *f, document_t *doc) {
    struct archive *a = NULL;
    struct archive_entry *entry = NULL;
    arc_data_t arc_data;

    // NOTE(review): this gate is kept from the original, but it makes text
    // extraction depend on the thumbnail size; it looks like content_size was
    // meant — confirm.
    if (ctx->tn_size <= 0) {
        return;
    }

    int ret = arc_open(&arc_ctx, f, &a, &arc_data, TRUE);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(f->filepath, "(ebook.c) [%d] %s", ret, archive_error_string(a))
        archive_read_free(a);
        return;
    }

    // Created only after every early return: it used to be allocated first
    // and was leaked on both exits above.
    text_buffer_t content_buffer = text_buffer_create(ctx->content_size);

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        struct stat info = *archive_entry_stat(entry);
        if (!S_ISREG(info.st_mode)) {
            continue;
        }

        const char *utf8_name = archive_entry_pathname_utf8(entry);
        const char *file_path = utf8_name == NULL ? archive_entry_pathname(entry) : utf8_name;
        if (file_path == NULL) {
            continue;
        }

        const char *p = strrchr(file_path, '.');
        if (p == NULL || (strcmp(p, ".html") != 0 && strcmp(p, ".xhtml") != 0)) {
            continue;
        }

        size_t entry_size = archive_entry_size(entry);
        char *buf = malloc(entry_size + 1);
        if (buf == NULL) {
            CTX_LOG_ERRORF("ebook.c", "Could not allocate %zu bytes for entry", entry_size + 1)
            break;
        }

        // Signed: archive_read_data() returns a negative value on failure.
        ssize_t bytes_read = archive_read_data(a, buf, entry_size);
        if (bytes_read < 0 || (size_t) bytes_read != entry_size) {
            const char *err_str = archive_error_string(a);
            if (err_str) {
                CTX_LOG_ERRORF("ebook.c", "Error while reading entry: %s", err_str)
            }
            free(buf);
            break;
        }
        buf[entry_size] = '\0';

        ret = text_buffer_append_markup(&content_buffer, buf);
        free(buf);

        if (ret == TEXT_BUF_FULL) {
            break;
        }
    }

    text_buffer_terminate_string(&content_buffer);

    meta_line_t *meta_content = malloc(sizeof(meta_line_t) + content_buffer.dyn_buffer.cur);
    if (meta_content != NULL) {
        meta_content->key = MetaContent;
        memcpy(meta_content->str_val, content_buffer.dyn_buffer.buf, content_buffer.dyn_buffer.cur);
        APPEND_META(doc, meta_content)
    }

    text_buffer_destroy(&content_buffer);

    archive_read_free(a);
}
/**
 * Entry point of the ebook scanner.
 *
 * EPUB files take the archive-based fast path when ctx->fast_epub_parse is
 * set. Everything else is read fully into memory and handed to
 * parse_ebook_mem().
 */
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc) {
    if (ctx->fast_epub_parse && is_epub(mime_str)) {
        parse_epub_fast(ctx, f, doc);
        return;
    }

    size_t buf_len;
    void *buf = read_all(f, &buf_len);

    if (buf != NULL) {
        parse_ebook_mem(ctx, buf, buf_len, mime_str, doc, FALSE);
        free(buf);
        return;
    }

    CTX_LOG_ERROR(f->filepath, "read_all() failed")
}

View File

@ -1,30 +0,0 @@
#ifndef SCAN_EBOOK_H
#define SCAN_EBOOK_H
#include "../scan.h"
// Settings and callbacks of the ebook scanner.
typedef struct {
// Capacity of the extracted-text buffer; no text is extracted by parse_ebook_mem() when <= 0.
long content_size;
// Length of the longer side of the cover thumbnail; no cover is rendered when <= 0.
int tn_size;
// Tesseract language; image OCR is enabled in parse_ebook_mem() only when non-NULL.
const char *tesseract_lang;
// Data path passed to TessBaseAPIInit3().
const char *tesseract_path;
// NOTE(review): not referenced by the ebook.c code shown alongside this header — confirm it is still needed.
pthread_mutex_t mupdf_mutex;
log_callback_t log;
logf_callback_t logf;
// Receives the encoded cover thumbnail, keyed by the document's path_md5.
store_callback_t store;
// Non-zero: EPUB files are parsed by parse_epub_fast() instead of MuPDF.
int fast_epub_parse;
// Quality setting passed to alloc_jpeg_encoder() for the cover thumbnail.
float tn_qscale;
} scan_ebook_ctx_t;
// Parses an ebook read from f; mime_str selects the EPUB fast path and the MuPDF document handler.
void parse_ebook(scan_ebook_ctx_t *ctx, vfile_t *f, const char *mime_str, document_t *doc);
// Parses an ebook already held in memory; with tn_only set, only the page count and thumbnail are produced.
void
parse_ebook_mem(scan_ebook_ctx_t *ctx, void *buf, size_t buf_len, const char *mime_str, document_t *doc, int tn_only);
/* Non-zero when mime_string is exactly the EPUB MIME type. */
__always_inline
static int is_epub(const char *mime_string) {
    static const char epub_mime[] = "application/epub+zip";
    return strcmp(mime_string, epub_mime) == 0;
}
#endif

View File

@ -1,246 +0,0 @@
#include "font.h"
#include <ft2build.h>
#include <freetype/freetype.h>
#include "../util.h"
// Per-thread FreeType library handle, created on first use by parse_font().
__thread FT_Library ft_lib = NULL;
// Size of the bitmap needed to render a string (computed by text_dimension()).
typedef struct text_dimensions {
// Sum over all characters of max(advance, bitmap width) plus kerning.
unsigned int width;
// Largest ascent plus largest descent over all characters.
unsigned int height;
// Largest descent over all characters.
unsigned int baseline;
} text_dimensions_t;
// Metrics of one rendered glyph, in pixels (filled by ft_glyph_to_glyph()).
typedef struct glyph {
// Bitmap top bearing (FreeType bitmap_top).
int top;
// Bitmap rows.
int height;
// Bitmap columns.
int width;
// Rows below the baseline: max(0, height - top).
int descent;
// max(0, max(top, height) - descent).
int ascent;
// Horizontal advance (26.6 fixed point divided by 64).
int advance_width;
// Points into the FreeType glyph slot's bitmap buffer; not owned by this struct.
unsigned char *pixmap;
} glyph_t;
/**
 * Horizontal kerning, in whole pixels, to apply between the previous character
 * pc and the current character c.
 *
 * FT_Get_Kerning() takes glyph indices, left glyph first. The previous code
 * passed raw character codes, and in (current, previous) order, so it looked
 * up unrelated glyph pairs.
 *
 * @param c    Current (right-hand) character.
 * @param pc   Previous (left-hand) character, 0 when c starts the text.
 * @param face Face the characters are rendered with.
 * @return Kerning in pixels; 0 when there is no previous character or the
 *         lookup fails.
 */
__always_inline
int kerning_offset(char c, char pc, FT_Face face) {
    FT_Vector kerning = {0, 0};

    if (pc == 0) {
        return 0;
    }

    // Go through unsigned char so bytes >= 0x80 are not sign-extended into huge codes.
    const FT_UInt left_glyph = FT_Get_Char_Index(face, (FT_ULong) (unsigned char) pc);
    const FT_UInt right_glyph = FT_Get_Char_Index(face, (FT_ULong) (unsigned char) c);

    if (FT_Get_Kerning(face, left_glyph, right_glyph, FT_KERNING_DEFAULT, &kerning) != 0) {
        return 0;
    }

    // FT_KERNING_DEFAULT yields 26.6 fixed-point values.
    return (int) (kerning.x / 64);
}
/**
 * Translate a rendered FreeType glyph slot into pixel metrics.
 * The returned pixmap pointer aliases the slot's bitmap buffer.
 */
__always_inline
glyph_t ft_glyph_to_glyph(FT_GlyphSlot slot) {
    const int rows = (int) slot->bitmap.rows;
    const int top = slot->bitmap_top;
    const int descent = MAX(0, rows - top);

    glyph_t glyph;
    glyph.pixmap = slot->bitmap.buffer;
    glyph.width = (int) slot->bitmap.width;
    glyph.height = rows;
    glyph.top = top;
    glyph.advance_width = (int) slot->advance.x / 64;
    glyph.descent = descent;
    glyph.ascent = MAX(0, MAX(top, rows) - descent);

    return glyph;
}
/**
 * Measure the bitmap needed to draw text with face.
 *
 * Width is the sum, over all characters, of max(advance, bitmap width) plus
 * the kerning against the previous character. Height is the tallest ascent
 * plus the deepest descent, and baseline is the deepest descent.
 */
text_dimensions_t text_dimension(char *text, FT_Face face) {
    text_dimensions_t dimensions;
    dimensions.width = 0;

    unsigned int max_ascent = 0;
    int max_descent = 0;
    char pc = 0;

    for (const char *cursor = text; *cursor != '\0'; cursor++) {
        const char c = *cursor;

        FT_Load_Char(face, c, 0);
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        max_descent = MAX(max_descent, glyph.descent);
        max_ascent = MAX(max_ascent, MAX(glyph.height, glyph.ascent));

        int kerning_x = kerning_offset(c, pc, face);
        dimensions.width += MAX(glyph.advance_width, glyph.width) + kerning_x;

        pc = c;
    }

    dimensions.baseline = max_descent;
    dimensions.height = max_ascent + max_descent;

    return dimensions;
}
/**
 * OR a glyph's coverage bitmap into the text bitmap with its top-left corner
 * at (x, y). The text bitmap is text_info.width wide; writes that would fall
 * past its end are skipped.
 */
void draw_glyph(glyph_t *glyph, int x, int y, struct text_dimensions text_info, unsigned char *bitmap) {
    const unsigned int rows = glyph->height;
    const unsigned int cols = glyph->width;
    const unsigned int stride = text_info.width;
    const unsigned int buf_len = text_info.width * text_info.height;
    const unsigned int origin = y * text_info.width + x;

    unsigned int src = 0;

    for (unsigned int row = 0; row < rows; row++) {
        // Index of the first destination pixel of this glyph row.
        unsigned int dst = origin + row * stride;

        for (unsigned int col = 0; col < cols; col++, src++, dst++) {
            if (dst < buf_len) {
                bitmap[dst] |= glyph->pixmap[src];
            }
        }
    }
}
/*
 * Serialise an 8-bit-per-pixel bitmap into `buf` as a BMP file.
 *
 * buf        : destination buffer; expected to be empty on entry, because the
 *              header records the pixel array at offset 14 + 40 + 256 * 4.
 * dimensions : width/height of `bitmap` in pixels.
 * bitmap     : width * height bytes, top row first.
 *
 * The colour table is written from 255 down to 0, so pixel value 0 maps to
 * white and 255 to black.
 */
void bmp_format(dyn_buffer_t *buf, text_dimensions_t dimensions, const unsigned char *bitmap) {

    dyn_buffer_write_short(buf, 0x4D42); // Magic
    dyn_buffer_write_int(buf, 0); // Size placeholder
    dyn_buffer_write_int(buf, 0x5157); //Reserved
    dyn_buffer_write_int(buf, 14 + 40 + 256 * 4); // pixels offset

    dyn_buffer_write_int(buf, 40); // DIB size
    dyn_buffer_write_int(buf, (int) dimensions.width);
    dyn_buffer_write_int(buf, (int) dimensions.height);
    dyn_buffer_write_short(buf, 1); // Color planes
    dyn_buffer_write_short(buf, 8); // bits per pixel
    dyn_buffer_write_int(buf, 0); // compression
    dyn_buffer_write_int(buf, 0); // Ignored
    dyn_buffer_write_int(buf, 3800); // hres
    dyn_buffer_write_int(buf, 3800); // vres
    dyn_buffer_write_int(buf, 256); // Color count
    dyn_buffer_write_int(buf, 0); // Ignored

    // RGBA32 Color table (Grayscale)
    for (int i = 255; i >= 0; i--) {
        dyn_buffer_write_int(buf, i + (i << 8) + (i << 16));
    }

    // Each row must occupy a multiple of 4 bytes. The padding is derived from
    // the row width rather than from the absolute buffer position: the pixel
    // array starts at offset 1078, which is not 4-byte aligned, so aligning
    // the write cursor would give the first row the wrong stride.
    const unsigned int row_padding = (4 - dimensions.width % 4) % 4;

    // Pixel array: write from bottom to top
    for (int y = (int) dimensions.height - 1; y >= 0; y--) {
        for (unsigned int x = 0; x < dimensions.width; x++) {
            dyn_buffer_write_char(buf, (char) bitmap[y * dimensions.width + x]);
        }
        for (unsigned int p = 0; p < row_padding; p++) {
            dyn_buffer_write_char(buf, 0);
        }
    }

    // Patch the size placeholder. It sits at offset 2, which is not aligned
    // for an int, so copy the bytes instead of storing through an int pointer.
    int file_size = (int) buf->cur;
    memcpy((char *) buf->buf + 2, &file_size, sizeof(file_size));
}
/*
 * Extract the font name from a font file and, when thumbnails are enabled,
 * render that name with the font itself and store the result as a BMP.
 *
 * ctx : logging/store callbacks and the enable_tn switch.
 * f   : file to read (fully loaded into memory with read_all()).
 * doc : receives a MetaFontName entry and, when a preview is produced, a
 *       thumbnail-dimension entry.
 *
 * Every exit path releases the FreeType face and the file buffer.
 */
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc) {
    if (ft_lib == NULL) {
        FT_Init_FreeType(&ft_lib);
    }

    size_t buf_len = 0;
    void *buf = read_all(f, &buf_len);

    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FT_Face face;
    FT_Error err = FT_New_Memory_Face(ft_lib, (unsigned char *) buf, (int) buf_len, 0, &face);
    if (err != 0) {
        CTX_LOG_ERRORF(doc->filepath, "(font.c) FT_New_Memory_Face() returned error code [%d] %s", err,
                       FT_Error_String(err))
        free(buf);
        return;
    }

    // Build "<family> <style>", or just "<family>" when the style is missing
    // or the placeholder "?". snprintf() always NUL-terminates, and a missing
    // family name is replaced explicitly instead of passing NULL to "%s".
    char font_name[4096];
    const char *family_name = face->family_name != NULL ? face->family_name : "(null)";

    if (face->style_name == NULL || (strcmp(face->style_name, "?") == 0)) {
        snprintf(font_name, sizeof(font_name), "%s", family_name);
    } else {
        snprintf(font_name, sizeof(font_name), "%s %s", family_name, face->style_name);
    }

    meta_line_t *meta_name = malloc(sizeof(meta_line_t) + strlen(font_name));
    meta_name->key = MetaFontName;
    strcpy(meta_name->str_val, font_name);
    APPEND_META(doc, meta_name)

    // Thumbnails disabled: the name is all that is needed
    if (!ctx->enable_tn) {
        FT_Done_Face(face);
        free(buf);
        return;
    }

    int pixel = 64;
    int num_chars = (int) strlen(font_name);

    err = FT_Set_Pixel_Sizes(face, 0, pixel);
    if (err != 0) {
        CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Set_Pixel_Sizes() returned error code [%d] %s", err,
                         FT_Error_String(err))
        FT_Done_Face(face);
        free(buf);
        return;
    }

    text_dimensions_t dimensions = text_dimension(font_name, face);
    unsigned char *bitmap = calloc(dimensions.width * dimensions.height, 1);
    if (bitmap == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(font.c) Could not allocate thumbnail bitmap")
        FT_Done_Face(face);
        free(buf);
        return;
    }

    FT_Vector pen;
    pen.x = 0;

    char pc = 0;
    for (int i = 0; i < num_chars; i++) {
        char c = font_name[i];

        err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
        if (err != 0) {
            // Retry with the case swapped before giving up on this character
            c = c >= 'a' && c <= 'z' ? c - 32 : c + 32;
            err = FT_Load_Char(face, c, FT_LOAD_NO_HINTING | FT_LOAD_RENDER);
            if (err != 0) {
                CTX_LOG_WARNINGF(doc->filepath, "(font.c) FT_Load_Char() returned error code [%d] %s", err,
                                 FT_Error_String(err))
                continue;
            }
        }
        glyph_t glyph = ft_glyph_to_glyph(face->glyph);

        pen.x += kerning_offset(c, pc, face);
        if (pen.x <= 0) {
            pen.x = ABS(glyph.advance_width - glyph.width);
        }
        // Top edge of the glyph so that its baseline sits on the text baseline
        pen.y = dimensions.height - glyph.ascent - dimensions.baseline;

        draw_glyph(&glyph, pen.x, pen.y, dimensions, bitmap);

        pen.x += glyph.advance_width;
        pc = c;
    }

    dyn_buffer_t bmp_data = dyn_buffer_create();
    bmp_format(&bmp_data, dimensions, bitmap);

    APPEND_TN_META(doc, dimensions.width, dimensions.height)
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) bmp_data.buf, bmp_data.cur);

    dyn_buffer_destroy(&bmp_data);
    free(bitmap);

    FT_Done_Face(face);
    free(buf);
}
/*
 * Release the FreeType library handle, if one was created.
 * The handle is reset to NULL so that a later parse_font() call
 * re-initialises FreeType instead of using a destroyed library.
 */
void cleanup_font() {
    if (ft_lib != NULL) {
        FT_Done_FreeType(ft_lib);
        ft_lib = NULL;
    }
}

View File

@ -1,17 +0,0 @@
#ifndef SCAN_FONT_H
#define SCAN_FONT_H
#include "../scan.h"
/* Settings and callbacks used by the font parser. */
typedef struct {
int enable_tn; // consulted by parse_font() when deciding whether to render a preview bitmap
log_callback_t log; // logging callback
logf_callback_t logf; // formatted logging callback
store_callback_t store; // receives the thumbnail bytes, keyed by the document's path_md5
} scan_font_ctx_t;
/* Read the font in `f`, append its name to `doc` and optionally store a preview. */
void parse_font(scan_font_ctx_t *ctx, vfile_t *f, document_t *doc);
/* Release the FreeType library handle used by parse_font(). */
void cleanup_font();
#endif

View File

@ -1,119 +0,0 @@
#include "json.h"
#include "cjson/cJSON.h"
// parse_json() skips files larger than this many bytes (50 MiB)
#define JSON_MAX_FILE_SIZE (1024 * 1024 * 50)
/*
 * Recursively append every string value found in `json` to `tex`, each
 * followed by a single space.
 *
 * Returns TRUE as soon as the text buffer reports TEXT_BUF_FULL (the caller
 * should stop walking), FALSE otherwise. Values that are neither containers
 * nor strings contribute nothing.
 */
int json_extract_text(cJSON *json, text_buffer_t *tex) {
    if (cJSON_IsObject(json) || cJSON_IsArray(json)) {
        // Members of objects and arrays are both linked through child/next
        for (cJSON *node = json->child; node != NULL; node = node->next) {
            if (json_extract_text(node, tex)) {
                return TRUE;
            }
        }
        return FALSE;
    }

    if (!cJSON_IsString(json)) {
        return FALSE;
    }

    if (text_buffer_append_string0(tex, json->valuestring) == TEXT_BUF_FULL) {
        return TRUE;
    }
    return text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL ? TRUE : FALSE;
}
/*
 * Index the string values of a JSON document as MetaContent.
 *
 * Returns SCAN_ERR_SKIP when the file exceeds JSON_MAX_FILE_SIZE,
 * SCAN_ERR_READ when the file cannot be loaded into memory, SCAN_OK otherwise.
 * A document that fails to parse yields empty content rather than an error.
 */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    if (f->info.st_size > JSON_MAX_FILE_SIZE) {
        CTX_LOG_WARNINGF("json.c", "File larger than maximum allowed [%s]", f->filepath)
        return SCAN_ERR_SKIP;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);

    if (buf == NULL) {
        return SCAN_ERR_READ;
    }

    // cJSON needs a NUL-terminated string. Grow through a temporary so the
    // original allocation is not lost (or dereferenced as NULL) on failure.
    char *terminated = realloc(buf, buf_len + 1);
    if (terminated == NULL) {
        free(buf);
        return SCAN_ERR_READ;
    }
    buf = terminated;
    buf[buf_len] = '\0';

    cJSON *json = cJSON_ParseWithOpts(buf, NULL, TRUE);
    text_buffer_t tex = text_buffer_create(ctx->content_size);

    json_extract_text(json, &tex);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    cJSON_Delete(json);
    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
// Size of the sliding window used by parse_ndjson(); a single JSON value
// must fit inside it.
#define JSON_BUF_SIZE (1024 * 1024 * 5)

/*
 * Index the string values of a newline-delimited JSON file as MetaContent.
 *
 * The file is consumed through a fixed JSON_BUF_SIZE window: one value is
 * parsed from the front of the window, the unparsed remainder is moved to the
 * front, and the freed space at the end is refilled from the file.
 *
 * Always returns SCAN_OK; parsing simply stops at the first position where no
 * further value can be parsed.
 */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc) {

    char *buf = calloc(JSON_BUF_SIZE + 1, sizeof(char));
    *(buf + JSON_BUF_SIZE) = '\0';

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    size_t ret;
    int eof = FALSE;
    const char *parse_end = buf;
    size_t to_read;
    char *ptr = buf;

    while (TRUE) {
        cJSON *json;

        if (!eof) {
            // First pass fills the whole window; later passes refill only the
            // bytes consumed by the previous parse.
            to_read = parse_end == buf ? JSON_BUF_SIZE : parse_end - buf;
            ret = f->read(f, ptr, to_read);
            if (ret != to_read) {
                eof = TRUE;
            }
        }

        json = cJSON_ParseWithOpts(buf, &parse_end, FALSE);

        if (parse_end == buf + JSON_BUF_SIZE) {
            CTX_LOG_ERRORF("json.c", "Line too large for buffer [%s]", doc->filepath);
            cJSON_Delete(json);
            break;
        }

        if (parse_end == buf) {
            // Nothing could be parsed: end of input
            cJSON_Delete(json);
            break;
        }

        json_extract_text(json, &tex);
        cJSON_Delete(json);

        // Slide the unparsed tail to the front; new data goes right after it
        memmove(buf, parse_end, (buf + JSON_BUF_SIZE - parse_end));
        ptr = buf + JSON_BUF_SIZE - parse_end + buf;
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    // The function is declared to return a scan_code_t; falling off the end
    // would hand the caller an indeterminate value.
    return SCAN_OK;
}

View File

@ -1,30 +0,0 @@
#ifndef SCAN_JSON_H
#define SCAN_JSON_H
#include "../scan.h"
/* Settings and callbacks used by the JSON / NDJSON parsers. */
typedef struct {
long content_size; // passed to text_buffer_create() as the size of the extracted text buffer
log_callback_t log; // logging callback
logf_callback_t logf; // formatted logging callback
store_callback_t store; // storage callback
unsigned int json_mime; // mime id that is_json() matches against
unsigned int ndjson_mime; // mime id that is_ndjson() matches against
} scan_json_ctx_t;
/* Index the string values of a single JSON document. */
scan_code_t parse_json(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
/* Index the string values of a newline-delimited JSON file. */
scan_code_t parse_ndjson(scan_json_ctx_t *ctx, vfile_t *f, document_t *doc);
/* Non-zero when `mime` is the id configured for JSON in `ctx`. */
__always_inline
static int is_json(scan_json_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->json_mime;
    return expected == mime;
}
/* Non-zero when `mime` is the id configured for NDJSON in `ctx`. */
__always_inline
static int is_ndjson(scan_json_ctx_t *ctx, unsigned int mime) {
    const unsigned int expected = ctx->ndjson_mime;
    return expected == mime;
}
#endif

View File

@ -1,62 +0,0 @@
/* Boolean helpers, defined only when the platform has not already done so. */
#ifndef FALSE
#define FALSE (0)
#define BOOL int
#endif
#ifndef TRUE
#define TRUE (!FALSE)
#endif
/* MAX/MIN evaluate their arguments twice: do not pass expressions with side effects. */
#undef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#undef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
/* ABS evaluates its argument twice as well. */
#undef ABS
#define ABS(a) (((a) < 0) ? -(a) : (a))
#define SHA1_STR_LENGTH 41
#define SHA1_DIGEST_LENGTH 20
/*
 * Append a string metadata entry to `doc`. `value` is evaluated twice.
 * NOTE(review): the allocation adds strlen(value) with no +1 for the
 * terminator; presumably meta_line_t itself leaves room for it - confirm
 * against the struct definition.
 */
#define APPEND_STR_META(doc, keyname, value) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + strlen(value)); \
meta_str->key = keyname; \
strcpy(meta_str->str_val, value); \
APPEND_META(doc, meta_str)}
/* Append an integer metadata entry to `doc`. */
#define APPEND_LONG_META(doc, keyname, value) \
{meta_line_t *meta_long = malloc(sizeof(meta_line_t)); \
meta_long->key = keyname; \
meta_long->long_val = value; \
APPEND_META(doc, meta_long)}
/*
 * Append the thumbnail dimensions to `doc`, formatted as "WWWW,HHHH".
 * NOTE(review): the extra space is sized for two 4-digit numbers and a comma;
 * a dimension above 9999 would need more room - confirm callers cannot exceed it.
 */
#define APPEND_TN_META(doc, width, height) \
{meta_line_t *meta_str = malloc(sizeof(meta_line_t) + 4 + 1 + 4); \
meta_str->key = MetaThumbnail; \
sprintf(meta_str->str_val, "%04d,%04d", width, height); \
APPEND_META(doc, meta_str)}
/*
 * Link `meta` at the tail of doc's metadata list.
 * Expands to several bare statements (not wrapped in a block), so it must be
 * used where a statement sequence is valid and is written without a trailing
 * semicolon at its call sites.
 */
#define APPEND_META(doc, meta) \
meta->next = NULL;\
if (doc->meta_head == NULL) {\
doc->meta_head = meta;\
doc->meta_tail = doc->meta_head;\
} else {\
doc->meta_tail->next = meta;\
doc->meta_tail = meta;\
}
/*
 * Append `str` to `doc` after passing it through a text buffer.
 * Declares the locals `tex` and `meta_tag` in the enclosing scope, so it can
 * be expanded at most once per block.
 */
#define APPEND_UTF8_META(doc, keyname, str) \
text_buffer_t tex = text_buffer_create(-1); \
text_buffer_append_string0(&tex, str); \
text_buffer_terminate_string(&tex); \
meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur); \
meta_tag->key = keyname; \
strcpy(meta_tag->str_val, tex.dyn_buffer.buf); \
APPEND_META(doc, meta_tag) \
text_buffer_destroy(&tex);

View File

@ -1,749 +0,0 @@
#include "media.h"
#include <ctype.h>
// Frames or streams whose width or height is not larger than this get no thumbnail
#define MIN_SIZE 32
// Size of the I/O buffer handed to avio_alloc_context()
#define AVIO_BUF_SIZE 8192
// True when the input format has a name and that name is not "image2"
#define IS_VIDEO(fmt) (fmt->iformat->name && strcmp(fmt->iformat->name, "image2") != 0)
// Sentinel returned by scale_frame(): store the original packet instead of re-encoding
#define STORE_AS_IS ((void*)-1)
/*
 * Pick the name handed to avformat_open_input().
 *
 * When the document has no extension (doc->ext does not point past doc->base)
 * and the mime type is PNG or JPEG, a placeholder name carrying the matching
 * extension is returned; in every other case `filepath` is returned unchanged.
 */
const char *get_filepath_with_ext(document_t *doc, const char *filepath, const char *mime_str) {
    if (doc->ext > doc->base) {
        return filepath;
    }

    if (strcmp(mime_str, "image/png") == 0) {
        return "file.png";
    }
    if (strcmp(mime_str, "image/jpeg") == 0) {
        return "file.jpg";
    }

    return filepath;
}
__always_inline
void *scale_frame(const AVCodecContext *decoder, const AVFrame *frame, int size) {
if (frame->pict_type == AV_PICTURE_TYPE_NONE) {
return NULL;
}
int dstW;
int dstH;
if (frame->width <= size && frame->height <= size) {
if (decoder->codec_id == AV_CODEC_ID_MJPEG || decoder->codec_id == AV_CODEC_ID_PNG) {
return STORE_AS_IS;
}
dstW = frame->width;
dstH = frame->height;
} else {
double ratio = (double) frame->width / frame->height;
if (frame->width > frame->height) {
dstW = size;
dstH = (int) (size / ratio);
} else {
dstW = (int) (size * ratio);
dstH = size;
}
}
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
return NULL;
}
AVFrame *scaled_frame = av_frame_alloc();
struct SwsContext *sws_ctx = sws_getContext(
decoder->width, decoder->height, decoder->pix_fmt,
dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0
);
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len * 2);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
sws_scale(sws_ctx,
(const uint8_t *const *) frame->data, frame->linesize,
0, decoder->height,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = dstW;
scaled_frame->height = dstH;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
return scaled_frame;
}
/* A decoded frame together with the packet it was decoded from (see read_frame()). */
typedef struct {
AVPacket *packet; // allocated with av_packet_alloc()
AVFrame *frame; // allocated with av_frame_alloc()
} frame_and_packet_t;
/*
 * Release a frame_and_packet_t created by read_frame(), including the packet
 * and frame it owns.
 *
 * The packet and frame are FFmpeg objects and are released only through
 * av_packet_free()/av_frame_free(), which also reset the pointers to NULL;
 * calling free() on the packet pointer afterwards could never do anything.
 */
static void frame_and_packet_free(frame_and_packet_t *frame_and_packet) {
    if (frame_and_packet->packet != NULL) {
        av_packet_free(&frame_and_packet->packet);
    }
    if (frame_and_packet->frame != NULL) {
        av_frame_free(&frame_and_packet->frame);
    }

    free(frame_and_packet);
}
/*
 * Decode every packet of subtitle stream `stream_idx` and append the text to
 * `doc` as MetaContent.
 *
 * For each decoded rectangle, only what follows the first "\N" marker in its
 * ASS string is kept, and only when more than one character follows it.
 * The format context is read to its end; the caller seeks back if needed.
 */
__always_inline
static void read_subtitles(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, int stream_idx, document_t *doc) {

    text_buffer_t tex = text_buffer_create(-1);

    AVCodec *subtitle_codec = avcodec_find_decoder(pFormatCtx->streams[stream_idx]->codecpar->codec_id);
    AVCodecContext *decoder = avcodec_alloc_context3(subtitle_codec);
    avcodec_parameters_to_context(decoder, pFormatCtx->streams[stream_idx]->codecpar);
    avcodec_open2(decoder, subtitle_codec, NULL);

    decoder->sub_text_format = FF_SUB_TEXT_FMT_ASS;

    AVPacket packet;
    AVSubtitle subtitle;
    int got_sub;

    while (av_read_frame(pFormatCtx, &packet) == 0) {

        if (packet.stream_index == stream_idx) {
            avcodec_decode_subtitle2(decoder, &subtitle, &got_sub, &packet);

            if (got_sub) {
                for (int i = 0; i < subtitle.num_rects; i++) {
                    const char *text = subtitle.rects[i]->ass;
                    if (text == NULL) {
                        continue;
                    }

                    char *marker = strstr(text, "\\N");
                    if (marker != NULL && strlen(marker + 2) > 1) {
                        text_buffer_append_string0(&tex, marker + 2);
                        text_buffer_append_char(&tex, ' ');
                    }
                }
                avsubtitle_free(&subtitle);
            }
        }

        // Packets of other streams are simply dropped
        av_packet_unref(&packet);
    }

    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
    text_buffer_destroy(&tex);
    avcodec_free_context(&decoder);
}
/*
 * Read packets of stream `stream_idx` and feed them to `decoder` until it
 * stops answering -EAGAIN.
 *
 * Returns a newly allocated frame_and_packet_t holding the last packet sent
 * and the frame received, or NULL when reading a packet or sending it to the
 * decoder fails (a read failure other than end-of-file is logged as a
 * warning, a send failure as an error). Release the result with
 * frame_and_packet_free().
 */
__always_inline
static frame_and_packet_t *
read_frame(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVCodecContext *decoder, int stream_idx,
           document_t *doc) {

    frame_and_packet_t *result = calloc(1, sizeof(frame_and_packet_t));
    result->packet = av_packet_alloc();
    result->frame = av_frame_alloc();

    av_init_packet(result->packet);

    int receive_ret;
    do {
        // Get video frame
        for (;;) {
            int read_frame_ret = av_read_frame(pFormatCtx, result->packet);

            if (read_frame_ret != 0) {
                if (read_frame_ret != AVERROR_EOF) {
                    CTX_LOG_WARNINGF(doc->filepath,
                                     "(media.c) avcodec_read_frame() returned error code [%d] %s",
                                     read_frame_ret, av_err2str(read_frame_ret)
                    )
                }
                frame_and_packet_free(result);
                return NULL;
            }

            if (result->packet->stream_index == stream_idx) {
                break;
            }

            //Ignore audio/other frames
            av_packet_unref(result->packet);
        }

        // Feed it to decoder
        int decode_ret = avcodec_send_packet(decoder, result->packet);
        if (decode_ret != 0) {
            CTX_LOG_ERRORF(doc->filepath,
                           "(media.c) avcodec_send_packet() returned error code [%d] %s",
                           decode_ret, av_err2str(decode_ret)
            )
            frame_and_packet_free(result);
            return NULL;
        }

        receive_ret = avcodec_receive_frame(decoder, result->frame);
        if (receive_ret == -EAGAIN && result->packet != NULL) {
            // Decoder wants more input: drop this packet and read the next one
            av_packet_unref(result->packet);
        }
    } while (receive_ret == -EAGAIN);

    return result;
}
/*
 * Append tag->value to `doc` under `key`, unless the document already has an
 * entry with that key; in that case the tag is ignored and a debug message is
 * logged.
 */
void append_tag_meta_if_not_exists(scan_media_ctx_t *ctx, document_t *doc, AVDictionaryEntry *tag, enum metakey key) {

    for (meta_line_t *meta = doc->meta_head; meta != NULL; meta = meta->next) {
        if (meta->key == key) {
            CTX_LOG_DEBUGF(doc->filepath, "Ignoring duplicate tag: '%02x=%s' and '%02x=%s'",
                           key, meta->str_val, key, tag->value)
            return;
        }
    }

    // Pass the value through a text buffer before copying it into the entry
    text_buffer_t tex = text_buffer_create(-1);
    text_buffer_append_string0(&tex, tag->value);
    text_buffer_terminate_string(&tex);

    meta_line_t *meta_tag = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
    meta_tag->key = key;
    strcpy(meta_tag->str_val, tex.dyn_buffer.buf);

    APPEND_META(doc, meta_tag)

    text_buffer_destroy(&tex);
}
/* Append tag->value to doc under `keyname`; expects `doc` and `tag` in scope. */
#define APPEND_TAG_META(keyname) \
    APPEND_UTF8_META(doc, keyname, tag->value)

/*
 * Copy `str` into the char array `dst` and lower-case it in place.
 * `dst` must be a real array (sizeof(dst) is its capacity). The copy is
 * truncated to fit and is always NUL-terminated, so the lower-casing loop
 * cannot run past the end of `dst`. Declares the local `ptr` in the enclosing
 * scope, so it can be expanded at most once per block.
 */
#define STRCPY_TOLOWER(dst, str) \
    strncpy(dst, str, sizeof(dst) - 1); \
    dst[sizeof(dst) - 1] = '\0'; \
    char *ptr = dst; \
    for (; *ptr; ++ptr) *ptr = (char) tolower((unsigned char) *ptr);
/*
 * Copy selected container-level tags (artist, genre, title, album_artist,
 * album, comment) from the format context into `doc`. Tag names are compared
 * case-insensitively by lower-casing them first.
 */
__always_inline
static void append_audio_meta(AVFormatContext *pFormatCtx, document_t *doc) {
AVDictionaryEntry *tag = NULL;
// An empty key with AV_DICT_IGNORE_SUFFIX walks every entry of the dictionary
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "artist") == 0) {
APPEND_TAG_META(MetaArtist)
} else if (strcmp(key, "genre") == 0) {
APPEND_TAG_META(MetaGenre)
} else if (strcmp(key, "title") == 0) {
APPEND_TAG_META(MetaTitle)
} else if (strcmp(key, "album_artist") == 0) {
APPEND_TAG_META(MetaAlbumArtist)
} else if (strcmp(key, "album") == 0) {
APPEND_TAG_META(MetaAlbum)
} else if (strcmp(key, "comment") == 0) {
// Comments are indexed as document content
APPEND_TAG_META(MetaContent)
}
}
}
/*
 * Append video or image metadata to `doc`.
 *
 * is_video != 0 : adds duration and bitrate, then title/comment/artist from
 *                 the container tags (each only if not already present).
 * is_video == 0 : treats frame->metadata as EXIF and copies the recognised
 *                 tags.
 */
__always_inline
static void
append_video_meta(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, AVFrame *frame, document_t *doc, int is_video) {
if (is_video) {
meta_line_t *meta_duration = malloc(sizeof(meta_line_t));
meta_duration->key = MetaMediaDuration;
// duration is expressed in AV_TIME_BASE units
meta_duration->long_val = pFormatCtx->duration / AV_TIME_BASE;
// Values that do not fit in 32 bits are replaced by 0
if (meta_duration->long_val > INT32_MAX) {
meta_duration->long_val = 0;
}
APPEND_META(doc, meta_duration)
meta_line_t *meta_bitrate = malloc(sizeof(meta_line_t));
meta_bitrate->key = MetaMediaBitrate;
meta_bitrate->long_val = pFormatCtx->bit_rate;
APPEND_META(doc, meta_bitrate)
}
AVDictionaryEntry *tag = NULL;
if (is_video) {
// An empty key with AV_DICT_IGNORE_SUFFIX walks every entry of the dictionary
while ((tag = av_dict_get(pFormatCtx->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "title") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaTitle);
} else if (strcmp(key, "comment") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaContent);
} else if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
}
}
} else {
// EXIF metadata
while ((tag = av_dict_get(frame->metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) {
char key[256];
STRCPY_TOLOWER(key, tag->key)
if (strcmp(key, "artist") == 0) {
append_tag_meta_if_not_exists(ctx, doc, tag, MetaArtist);
} else if (strcmp(key, "imagedescription") == 0) {
APPEND_TAG_META(MetaContent)
} else if (strcmp(key, "make") == 0) {
APPEND_TAG_META(MetaExifMake)
} else if (strcmp(key, "model") == 0) {
APPEND_TAG_META(MetaExifModel)
} else if (strcmp(key, "software") == 0) {
APPEND_TAG_META(MetaExifSoftware)
} else if (strcmp(key, "fnumber") == 0) {
APPEND_TAG_META(MetaExifFNumber)
} else if (strcmp(key, "focallength") == 0) {
APPEND_TAG_META(MetaExifFocalLength)
} else if (strcmp(key, "usercomment") == 0) {
APPEND_TAG_META(MetaExifUserComment)
} else if (strcmp(key, "isospeedratings") == 0) {
APPEND_TAG_META(MetaExifIsoSpeedRatings)
} else if (strcmp(key, "exposuretime") == 0) {
APPEND_TAG_META(MetaExifExposureTime)
} else if (strcmp(key, "datetime") == 0) {
APPEND_TAG_META(MetaExifDateTime)
} else if (strcmp(key, "gpslatitude") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeDMS)
} else if (strcmp(key, "gpslatituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLatitudeRef)
} else if (strcmp(key, "gpslongitude") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeDMS)
} else if (strcmp(key, "gpslongituderef") == 0) {
APPEND_TAG_META(MetaExifGpsLongitudeRef)
}
}
}
}
/*
 * Extract metadata and a thumbnail from an opened format context.
 *
 * Steps: pick one audio, one video and one subtitle stream; optionally index
 * subtitles; append audio tags; then, if a video stream exists and
 * ctx->tn_size > 0, decode one frame, append video/EXIF metadata and store a
 * thumbnail through ctx->store.
 *
 * The format context is closed on every path, so the caller must not use
 * pFormatCtx afterwards.
 */
void parse_media_format_ctx(scan_media_ctx_t *ctx, AVFormatContext *pFormatCtx, document_t *doc) {
int video_stream = -1;
int audio_stream = -1;
int subtitle_stream = -1;
avformat_find_stream_info(pFormatCtx, NULL);
// Streams are visited from last to first; for audio and video the first one
// encountered is kept, for subtitles the last one encountered wins.
for (int i = (int) pFormatCtx->nb_streams - 1; i >= 0; i--) {
AVStream *stream = pFormatCtx->streams[i];
if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
if (audio_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaAudioCodec, desc->name)
}
audio_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
if (video_stream == -1) {
const AVCodecDescriptor *desc = avcodec_descriptor_get(stream->codecpar->codec_id);
if (desc != NULL) {
APPEND_STR_META(doc, MetaMediaVideoCodec, desc->name)
}
meta_line_t *meta_w = malloc(sizeof(meta_line_t));
meta_w->key = MetaWidth;
meta_w->long_val = stream->codecpar->width;
APPEND_META(doc, meta_w)
meta_line_t *meta_h = malloc(sizeof(meta_line_t));
meta_h->key = MetaHeight;
meta_h->long_val = stream->codecpar->height;
APPEND_META(doc, meta_h)
video_stream = i;
}
} else if (stream->codecpar->codec_type == AVMEDIA_TYPE_SUBTITLE) {
subtitle_stream = i;
}
}
if (subtitle_stream != -1 && ctx->read_subtitles) {
read_subtitles(ctx, pFormatCtx, subtitle_stream, doc);
// Reset stream
if (video_stream != -1) {
av_seek_frame(pFormatCtx, video_stream, 0, 0);
}
}
if (audio_stream != -1) {
append_audio_meta(pFormatCtx, doc);
}
if (video_stream != -1 && ctx->tn_size > 0) {
AVStream *stream = pFormatCtx->streams[video_stream];
// Too small to be worth a thumbnail
if (stream->codecpar->width <= MIN_SIZE || stream->codecpar->height <= MIN_SIZE) {
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
// Decoder
AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
AVCodecContext *decoder = avcodec_alloc_context3(video_codec);
avcodec_parameters_to_context(decoder, stream->codecpar);
avcodec_open2(decoder, video_codec, NULL);
//Seek
// Jump to 10% of the stream duration (skipped for single-frame streams and GIFs).
// The same seek is retried up to 21 times until it succeeds.
if (stream->nb_frames > 1 && stream->codecpar->codec_id != AV_CODEC_ID_GIF) {
int seek_ret;
for (int i = 20; i >= 0; i--) {
seek_ret = av_seek_frame(pFormatCtx, video_stream,
stream->duration * 0.10, 0);
if (seek_ret == 0) {
break;
}
}
}
frame_and_packet_t *frame_and_packet = read_frame(ctx, pFormatCtx, decoder, video_stream, doc);
if (frame_and_packet == NULL) {
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
append_video_meta(ctx, pFormatCtx, frame_and_packet->frame, doc, IS_VIDEO(pFormatCtx));
// Scale frame
AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
if (scaled_frame == NULL) {
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
return;
}
if (scaled_frame == STORE_AS_IS) {
// Small MJPEG/PNG image: the undecoded packet is stored directly
APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
frame_and_packet->packet->size);
} else {
// Encode frame to jpeg
// NOTE(review): alloc_jpeg_encoder() can return NULL and the send/receive
// return codes are not checked here - confirm whether that needs handling.
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
ctx->tn_qscale);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
// Save thumbnail
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
avcodec_free_context(&jpeg_encoder);
av_packet_unref(&jpeg_packet);
// The pixel buffer was allocated separately by scale_frame()
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
}
frame_and_packet_free(frame_and_packet);
avcodec_free_context(&decoder);
}
avformat_close_input(&pFormatCtx);
avformat_free_context(pFormatCtx);
}
/*
 * Open the media file at `filepath` directly through FFmpeg and hand the
 * resulting format context to parse_media_format_ctx(), which closes it.
 * Allocation and open failures are logged and nothing is added to `doc`.
 */
void parse_media_filename(scan_media_ctx_t *ctx, const char *filepath, document_t *doc) {

    AVFormatContext *format_ctx = avformat_alloc_context();
    if (format_ctx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return;
    }

    int open_ret = avformat_open_input(&format_ctx, filepath, NULL, NULL);
    if (open_ret >= 0) {
        parse_media_format_ctx(ctx, format_ctx, doc);
        return;
    }

    CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", open_ret, av_err2str(open_ret))
    avformat_close_input(&format_ctx);
    avformat_free_context(format_ctx);
}
/*
 * AVIO read callback backed by a vfile. `ptr` is the struct vfile passed to
 * avio_alloc_context(). A read of zero bytes is reported as AVERROR_EOF; any
 * other result is returned unchanged.
 */
int vfile_read(void *ptr, uint8_t *buf, int buf_size) {
    struct vfile *f = ptr;

    int bytes_read = f->read(f, buf, buf_size);

    return bytes_read == 0 ? AVERROR_EOF : bytes_read;
}
/* A file held entirely in memory and exposed as a FILE stream via fmemopen(). */
typedef struct {
size_t size; // number of bytes in buf
FILE *file; // stream over buf, opened with fmemopen()
void *buf; // file contents; freed by memfile_close()
} memfile_t;
/*
 * AVIO read callback backed by a memfile_t. Returns the number of bytes read,
 * or AVERROR_EOF when nothing was read and the stream is at end-of-file.
 */
int memfile_read(void *ptr, uint8_t *buf, int buf_size) {
    memfile_t *mem = ptr;

    size_t bytes_read = fread(buf, 1, buf_size, mem->file);

    if (bytes_read != 0 || !feof(mem->file)) {
        return (int) bytes_read;
    }
    return AVERROR_EOF;
}
/*
 * AVIO seek callback backed by a memfile_t.
 *
 * whence == 0x10000 is answered with the total size instead of seeking
 * (presumably FFmpeg's AVSEEK_SIZE request - confirm against avio.h).
 * Otherwise the stream is repositioned with fseek(); returns the new offset,
 * or AVERROR_EOF when fseek() fails.
 */
long memfile_seek(void *ptr, long offset, int whence) {
    memfile_t *mem = ptr;

    if (whence != 0x10000) {
        if (fseek(mem->file, offset, whence) != 0) {
            return AVERROR_EOF;
        }
        return ftell(mem->file);
    }

    return mem->size;
}
/*
 * Load the whole vfile into memory and open a read stream over it.
 *
 * When f->calculate_checksum is set, the SHA1 of the buffer is computed and
 * stored in `f`. Returns 0 when the full file was read and the stream was
 * opened, -1 otherwise. On a -1 result after the allocation succeeded,
 * mem->buf is still set.
 */
int memfile_open(vfile_t *f, memfile_t *mem) {
    mem->size = f->info.st_size;

    mem->buf = malloc(mem->size);
    if (mem->buf == NULL) {
        return -1;
    }

    int bytes_read = f->read(f, mem->buf, mem->size);
    mem->file = fmemopen(mem->buf, mem->size, "rb");

    if (f->calculate_checksum) {
        SHA1_Init(&f->sha1_ctx);
        safe_sha1_update(&f->sha1_ctx, mem->buf, mem->size);
        SHA1_Final(f->sha1_digest, &f->sha1_ctx);
        f->has_checksum = TRUE;
    }

    if (bytes_read != mem->size || mem->file == NULL) {
        return -1;
    }
    return 0;
}
/*
 * Wrap an existing buffer in a memfile_t without copying it.
 * The buffer stays owned by the caller. Returns 0 on success, -1 when
 * fmemopen() fails (mem->file is NULL in that case).
 *
 * The length is stored as-is: mem->size is a size_t, so narrowing it through
 * int would corrupt lengths above INT_MAX.
 */
int memfile_open_buf(void *buf, size_t buf_len, memfile_t *mem) {
    mem->size = buf_len;
    mem->buf = buf;
    mem->file = fmemopen(mem->buf, mem->size, "rb");

    return mem->file != NULL ? 0 : -1;
}
/*
 * Release a memfile_t filled by memfile_open(). Does nothing when no buffer
 * was allocated.
 *
 * The stream is closed before the buffer it reads from is freed, and only if
 * it was actually opened: memfile_open() can leave mem->file NULL when
 * fmemopen() fails. Both fields are reset so a second call is harmless.
 */
void memfile_close(memfile_t *mem) {
    if (mem->buf != NULL) {
        if (mem->file != NULL) {
            fclose(mem->file);
            mem->file = NULL;
        }
        free(mem->buf);
        mem->buf = NULL;
    }
}
/*
 * Parse a media file that is only reachable through the vfile interface.
 *
 * Files no larger than ctx->max_media_buffer are loaded into memory, which
 * gives FFmpeg a seekable input; otherwise (or when loading fails) the file is
 * streamed through vfile_read() without seek support.
 */
void parse_media_vfile(scan_media_ctx_t *ctx, struct vfile *f, document_t *doc, const char *mime_str) {

    AVFormatContext *format_ctx = avformat_alloc_context();
    if (format_ctx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return;
    }

    unsigned char *io_buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    AVIOContext *io_ctx = NULL;
    memfile_t memfile = {0, 0, 0};

    const char *open_name = get_filepath_with_ext(doc, f->filepath, mime_str);

    if (f->info.st_size <= ctx->max_media_buffer && memfile_open(f, &memfile) == 0) {
        CTX_LOG_DEBUGF(f->filepath, "Loading media file in memory (%ldB)", f->info.st_size)
        io_ctx = avio_alloc_context(io_buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
    }

    if (io_ctx == NULL) {
        CTX_LOG_DEBUGF(f->filepath, "Reading media file without seek support", f->info.st_size)
        io_ctx = avio_alloc_context(io_buffer, AVIO_BUF_SIZE, 0, f, vfile_read, NULL, NULL);
    }

    format_ctx->pb = io_ctx;

    int open_ret = avformat_open_input(&format_ctx, open_name, NULL, NULL);
    if (open_ret >= 0) {
        // parse_media_format_ctx() closes the format context itself
        parse_media_format_ctx(ctx, format_ctx, doc);

        av_free(io_ctx->buffer);
        avio_context_free(&io_ctx);
        memfile_close(&memfile);
        return;
    }

    // Error code -5 is not logged
    if (open_ret != -5) {
        CTX_LOG_ERRORF(doc->filepath, "(media.c) avformat_open_input() returned [%d] %s", open_ret, av_err2str(open_ret))
    }

    av_free(io_ctx->buffer);
    memfile_close(&memfile);
    avio_context_free(&io_ctx);
    avformat_close_input(&format_ctx);
    avformat_free_context(format_ctx);
}
/*
 * Entry point of the media parser: files that exist on the filesystem are
 * opened by path, everything else goes through the vfile callbacks.
 */
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char *mime_str) {
    if (!f->is_fs_file) {
        parse_media_vfile(ctx, f, doc, mime_str);
        return;
    }

    parse_media_filename(ctx, f->filepath, doc);
}
/* One-time setup for the media parser: sets FFmpeg's log level to AV_LOG_QUIET. */
void init_media() {
av_log_set_level(AV_LOG_QUIET);
}
/*
 * Decode the image held in `buf` and store a thumbnail for `doc`.
 *
 * ctx     : thumbnail settings and store callback.
 * buf     : encoded image; stays owned by the caller.
 * buf_len : number of bytes in `buf`.
 * url     : name handed to avformat_open_input().
 *
 * Returns TRUE when a thumbnail was stored, FALSE otherwise. Every resource
 * acquired here is released on every path.
 */
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url) {
    memfile_t memfile = {0, 0, 0};
    AVIOContext *io_ctx = NULL;
    AVCodecContext *decoder = NULL;
    frame_and_packet_t *frame_and_packet = NULL;
    int stored = FALSE;

    AVFormatContext *pFormatCtx = avformat_alloc_context();
    if (pFormatCtx == NULL) {
        CTX_LOG_ERROR(doc->filepath, "(media.c) Could not allocate context with avformat_alloc_context()")
        return FALSE;
    }

    unsigned char *buffer = (unsigned char *) av_malloc(AVIO_BUF_SIZE);
    if (buffer == NULL) {
        avformat_free_context(pFormatCtx);
        return FALSE;
    }

    if (memfile_open_buf(buf, buf_len, &memfile) != 0) {
        // fmemopen() failed: there is no stream to close, only the I/O buffer
        // and the format context to give back.
        av_free(buffer);
        avformat_free_context(pFormatCtx);
        return FALSE;
    }

    CTX_LOG_DEBUGF(doc->filepath, "Loading media file in memory (%zuB)", buf_len)
    io_ctx = avio_alloc_context(buffer, AVIO_BUF_SIZE, 0, &memfile, memfile_read, NULL, memfile_seek);
    if (io_ctx == NULL) {
        av_free(buffer);
        avformat_free_context(pFormatCtx);
        fclose(memfile.file);
        return FALSE;
    }

    pFormatCtx->pb = io_ctx;

    // On failure avformat_open_input() frees the context and resets the
    // pointer, so the stream count is only inspected after a successful open.
    int res = avformat_open_input(&pFormatCtx, url, NULL, NULL);
    if (res != 0 || pFormatCtx->nb_streams == 0) {
        goto cleanup;
    }

    AVStream *stream = pFormatCtx->streams[0];

    // Decoder
    const AVCodec *video_codec = avcodec_find_decoder(stream->codecpar->codec_id);
    decoder = avcodec_alloc_context3(video_codec);
    if (decoder == NULL) {
        goto cleanup;
    }
    avcodec_parameters_to_context(decoder, stream->codecpar);
    avcodec_open2(decoder, video_codec, NULL);

    frame_and_packet = read_frame(ctx, pFormatCtx, decoder, 0, doc);
    if (frame_and_packet == NULL) {
        goto cleanup;
    }

    // Scale frame
    AVFrame *scaled_frame = scale_frame(decoder, frame_and_packet->frame, ctx->tn_size);
    if (scaled_frame == NULL) {
        goto cleanup;
    }

    if (scaled_frame == STORE_AS_IS) {
        // Small MJPEG/PNG image: store the undecoded packet directly
        APPEND_TN_META(doc, frame_and_packet->frame->width, frame_and_packet->frame->height)
        ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) frame_and_packet->packet->data,
                   frame_and_packet->packet->size);
        stored = TRUE;
    } else {
        // Encode frame to jpeg
        AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height,
                                                          ctx->tn_qscale);
        if (jpeg_encoder != NULL) {
            avcodec_send_frame(jpeg_encoder, scaled_frame);

            AVPacket jpeg_packet;
            av_init_packet(&jpeg_packet);
            avcodec_receive_packet(jpeg_encoder, &jpeg_packet);

            // Save thumbnail
            APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
            ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
            stored = TRUE;

            av_packet_unref(&jpeg_packet);
            avcodec_free_context(&jpeg_encoder);
        }

        // The pixel buffer was allocated separately by scale_frame()
        av_free(*scaled_frame->data);
        av_frame_free(&scaled_frame);
    }

cleanup:
    if (frame_and_packet != NULL) {
        frame_and_packet_free(frame_and_packet);
    }
    if (decoder != NULL) {
        avcodec_free_context(&decoder);
    }
    avformat_close_input(&pFormatCtx);
    avformat_free_context(pFormatCtx);
    // FFmpeg may have replaced the I/O buffer, so free the one the context holds
    av_free(io_ctx->buffer);
    avio_context_free(&io_ctx);
    fclose(memfile.file);

    return stored;
}

View File

@ -1,52 +0,0 @@
#ifndef SIST2_MEDIA_H
#define SIST2_MEDIA_H
#include "../scan.h"
#include "libavformat/avformat.h"
#include "libswscale/swscale.h"
#include "libswresample/swresample.h"
#include "libavcodec/avcodec.h"
#include "libavutil/imgutils.h"
/* Settings and callbacks used by the media parser. */
typedef struct {
log_callback_t log; // logging callback
logf_callback_t logf; // formatted logging callback
store_callback_t store; // receives thumbnail bytes, keyed by the document's path_md5
int tn_size; // thumbnail size passed to scale_frame(); no thumbnail is made unless > 0
float tn_qscale; // passed to alloc_jpeg_encoder()
long max_media_buffer; // files up to this size are loaded fully into memory before parsing
int read_subtitles; // non-zero: index the text of a subtitle stream
} scan_media_ctx_t;
/*
 * Create and open an MJPEG encoder for w x h YUVJ420P frames.
 *
 * qscale is stored in the encoder's i_quant_factor field.
 * Returns the opened context, or NULL when it cannot be allocated or opened;
 * a context that fails to open is freed here rather than leaked. The caller
 * releases a successful result with avcodec_free_context().
 */
__always_inline
static AVCodecContext *alloc_jpeg_encoder(int w, int h, float qscale) {

    const AVCodec *jpeg_codec = avcodec_find_encoder(AV_CODEC_ID_MJPEG);
    AVCodecContext *jpeg = avcodec_alloc_context3(jpeg_codec);
    if (jpeg == NULL) {
        return NULL;
    }

    jpeg->width = w;
    jpeg->height = h;
    jpeg->time_base.den = 1000000;
    jpeg->time_base.num = 1;
    jpeg->i_quant_factor = qscale;
    jpeg->pix_fmt = AV_PIX_FMT_YUVJ420P;

    int ret = avcodec_open2(jpeg, jpeg_codec, NULL);
    if (ret != 0) {
        avcodec_free_context(&jpeg);
        return NULL;
    }

    return jpeg;
}
/* Extract metadata (and a thumbnail when enabled) from a media file. */
void parse_media(scan_media_ctx_t *ctx, vfile_t *f, document_t *doc, const char*mime_str);
/* One-time setup of the media parser. */
void init_media();
/* Decode the image in buf and store a thumbnail for doc; returns TRUE on success, FALSE otherwise. */
int store_image_thumbnail(scan_media_ctx_t *ctx, void *buf, size_t buf_len, document_t *doc, const char *url);
#endif

View File

@ -1,79 +0,0 @@
#include "scan_mobi.h"
#include <mobi.h>
#include <errno.h>
#include "stdlib.h"
/*
 * Extract author, title and text content from a MOBI e-book.
 *
 * The file is read fully into memory and handed to libmobi through an
 * fmemopen() stream. Author and title are appended when present; the raw
 * markup is passed through text_buffer_append_markup() and appended as
 * MetaContent. Every exit path releases the MOBIData handle and all buffers.
 */
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc) {

    MOBIData *m = mobi_init();
    if (m == NULL) {
        CTX_LOG_ERROR(f->filepath, "mobi_init() failed")
        return;
    }

    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        mobi_free(m);
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        mobi_free(m);
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        return;
    }

    MOBI_RET mobi_ret = mobi_load_file(m, file);
    fclose(file);
    if (mobi_ret != MOBI_SUCCESS) {
        mobi_free(m);
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "mobi_load_file() returned error code [%d]", mobi_ret)
        return;
    }

    char *author = mobi_meta_get_author(m);
    if (author != NULL) {
        APPEND_STR_META(doc, MetaAuthor, author)
        free(author);
    }
    char *title = mobi_meta_get_title(m);
    if (title != NULL) {
        APPEND_STR_META(doc, MetaTitle, title)
        free(title);
    }

    const size_t maxlen = mobi_get_text_maxsize(m);
    if (maxlen == MOBI_NOTSET) {
        // Release the MOBIData handle here too, and log against the file path
        mobi_free(m);
        free(buf);
        CTX_LOG_DEBUGF(f->filepath, "Invalid text maxsize: %zu", maxlen)
        return;
    }

    char *content_str = malloc(maxlen + 1);
    if (content_str == NULL) {
        mobi_free(m);
        free(buf);
        CTX_LOG_ERROR(f->filepath, "malloc() failed")
        return;
    }

    size_t length = maxlen;
    mobi_ret = mobi_get_rawml(m, content_str, &length);
    if (mobi_ret != MOBI_SUCCESS) {
        mobi_free(m);
        free(content_str);
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "mobi_get_rawml() returned error code [%d]", mobi_ret)
        return;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    text_buffer_append_markup(&tex, content_str);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)

    free(content_str);
    free(buf);
    text_buffer_destroy(&tex);
    mobi_free(m);
}

View File

@ -1,14 +0,0 @@
#ifndef SCAN_SCAN_MOBI_H
#define SCAN_SCAN_MOBI_H
#include "../scan.h"
/* Settings and callbacks used by the MOBI parser. */
typedef struct {
long content_size; // passed to text_buffer_create() as the size of the extracted text buffer
log_callback_t log; // logging callback
logf_callback_t logf; // formatted logging callback
} scan_mobi_ctx_t;
/* Extract author, title and text content from a MOBI e-book. */
void parse_mobi(scan_mobi_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@ -1,147 +0,0 @@
#include "msdoc.h"
#include <errno.h>
#include <sys/mman.h>
#include "../../third-party/antiword/src/antiword.h"
#include "../ebook/ebook.h"
/*
 * Convert a Word document to plain text with antiword and append author,
 * title and content to `doc`.
 *
 * file_in : stream over the document bytes.
 * buf     : the document bytes; this function frees it on every path.
 * buf_len : number of bytes in `buf`.
 *
 * NOTE(review): file_in is closed only on the pCreateDiagram() failure path,
 * as before; confirm with the caller who owns the stream on the other paths.
 */
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void *buf, size_t buf_len) {

    // Open word doc
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_text;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_utf_8;
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    // Unrecognised documents and version 3 are not converted
    int doc_word_version = iGuessVersionNumber(file_in, (int) buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        free(buf);
        return;
    }
    rewind(file_in);

    // antiword writes its output into a growing in-memory stream
    size_t out_len;
    char *out_buf;
    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // Release the output stream and both buffers, which would otherwise
        // be lost on this path
        fclose(file_out);
        free(out_buf);
        fclose(file_in);
        free(buf);
        return;
    }

    iInitDocument(file_in, (int) buf_len);
    const char *author = szGetAuthor();
    if (author != NULL) {
        APPEND_UTF8_META(doc, MetaAuthor, author)
    }

    const char *title = szGetTitle();
    if (title != NULL) {
        APPEND_UTF8_META(doc, MetaTitle, title)
    }
    vFreeDocument();

    bWordDecryptor(file_in, (int) buf_len, diag);
    vDestroyDiagram(diag);
    // Closing the stream finalises out_buf and out_len
    fclose(file_out);

    if (buf_len > 0) {
        text_buffer_t tex = text_buffer_create(ctx->content_size);
        text_buffer_append_string(&tex, out_buf, out_len);
        text_buffer_terminate_string(&tex);

        meta_line_t *meta_content = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        meta_content->key = MetaContent;
        memcpy(meta_content->str_val, tex.dyn_buffer.buf, tex.dyn_buffer.cur);
        APPEND_META(doc, meta_content)

        text_buffer_destroy(&tex);
    }

    free(buf);
    free(out_buf);
}
/**
 * Convert a Word document to PDF with antiword and feed the result to the ebook parser
 * (parse_ebook_mem) so that it can be processed as "application/pdf".
 *
 * Ownership:
 *  - file is NOT closed here; the caller keeps ownership of the stream.
 *  - buf is taken over and freed on every path.
 *
 * @param ctx      scan context; its fields are forwarded to a scan_ebook_ctx_t
 * @param doc      document being populated
 * @param file     readable stream positioned over the document bytes
 * @param buf      heap buffer owned by this call
 * @param buf_len  size of the document in bytes
 */
void parse_msdoc_pdf(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file, void *buf, size_t buf_len) {

    scan_ebook_ctx_t ebook_ctx = {
            .content_size = ctx->content_size,
            .tn_size = ctx->tn_size,
            .log = ctx->log,
            .logf = ctx->logf,
            .store = ctx->store,
    };

    // Open word doc
    options_type *opts = direct_vGetOptions();
    opts->iParagraphBreak = 74;
    opts->eConversionType = conversion_pdf;
    opts->bHideHiddenText = 1;
    opts->bRemoveRemovedText = 1;
    opts->bUseLandscape = 0;
    opts->eEncoding = encoding_latin_1;
    opts->iPageHeight = 842; // A4
    opts->iPageWidth = 595;
    opts->eImageLevel = level_ps_3;

    int doc_word_version = iGuessVersionNumber(file, (int) buf_len);
    if (doc_word_version < 0 || doc_word_version == 3) {
        free(buf);
        return;
    }
    rewind(file);

    size_t out_len = 0;
    char *out_buf = NULL;
    FILE *file_out = open_memstream(&out_buf, &out_len);
    if (file_out == NULL) {
        CTX_LOG_ERRORF(doc->filepath, "open_memstream() failed (%d)", errno)
        free(buf);
        return;
    }

    diagram_type *diag = pCreateDiagram("antiword", NULL, file_out);
    if (diag == NULL) {
        // Release the output stream, its buffer and the input copy (previously leaked)
        fclose(file_out);
        free(out_buf);
        free(buf);
        return;
    }

    bWordDecryptor(file, (int) buf_len, diag);
    vDestroyDiagram(diag);
    // Closing the memstream finalises out_buf / out_len
    fclose(file_out);

    parse_ebook_mem(&ebook_ctx, out_buf, out_len, "application/pdf", doc, TRUE);

    free(buf);
    free(out_buf);
}
/**
 * Entry point for Word (.doc) files: reads the whole file in memory, optionally renders it
 * through the PDF path (only when ctx->tn_size > 0), then extracts its text and metadata.
 *
 * @param ctx  scan context
 * @param f    virtual file to read
 * @param doc  document receiving the metadata
 */
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    char *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    FILE *file = fmemopen(buf, buf_len, "rb");
    if (file == NULL) {
        free(buf);
        CTX_LOG_ERRORF(f->filepath, "fmemopen() failed (%d)", errno)
        return;
    }

    if (ctx->tn_size > 0) {
        // parse_msdoc_pdf() frees the buffer it is given, so hand it a private copy
        char *buf_pdf = malloc(buf_len);
        if (buf_pdf == NULL) {
            CTX_LOG_ERROR(f->filepath, "malloc() failed, skipping PDF conversion")
        } else {
            memcpy(buf_pdf, buf, buf_len);
            parse_msdoc_pdf(ctx, doc, file, buf_pdf, buf_len);
        }
    }

    // parse_msdoc_text() takes ownership of buf and frees it
    parse_msdoc_text(ctx, doc, file, buf, buf_len);
    fclose(file);
}

View File

@ -1,24 +0,0 @@
#ifndef SCAN_SCAN_MSDOC_H
#define SCAN_SCAN_MSDOC_H
#include "../scan.h"
/* Context handed to the Word (.doc) parser. */
typedef struct {
/* Passed to text_buffer_create() as the size limit of the extracted text. */
long content_size;
/* The PDF conversion path of parse_msdoc() only runs when this is > 0. */
int tn_size;
/* Plain-string logging callback (invoked through the CTX_LOG_* macros). */
log_callback_t log;
/* printf-style logging callback (invoked through the CTX_LOG_*F macros). */
logf_callback_t logf;
/* Storage callback, forwarded to the ebook parser context by parse_msdoc_pdf(). */
store_callback_t store;
/* Mime identifier that is_msdoc() compares against. */
unsigned int msdoc_mime;
} scan_msdoc_ctx_t;
/* Returns non-zero when `mime` is the Word mime identifier configured in `ctx`. */
__always_inline
static int is_msdoc(scan_msdoc_ctx_t *ctx, unsigned int mime) {
    const unsigned int configured_mime = ctx->msdoc_mime;
    return configured_mime == mime;
}
void parse_msdoc(scan_msdoc_ctx_t *ctx, vfile_t *f, document_t *doc);
void parse_msdoc_text(scan_msdoc_ctx_t *ctx, document_t *doc, FILE *file_in, void* buf, size_t buf_len);
#endif

View File

@ -1,260 +0,0 @@
#include "ooxml.h"
#include <archive.h>
#include <archive_entry.h>
#include <libxml/xmlstring.h>
#include <libxml/parser.h>
#define _X(str) ((const xmlChar*)str)
/*
 * Tells whether an archive member of an OOXML package holds indexable text.
 * Returns TRUE when `part` starts with one of the known Word / PowerPoint / Excel
 * part prefixes, FALSE otherwise (including when `part` is NULL).
 */
__always_inline
static int should_read_part(const char *part) {

    if (part == NULL) {
        return FALSE;
    }

    // Word
    const int is_word_part = STR_STARTS_WITH(part, "word/document.xml")
                             || STR_STARTS_WITH(part, "word/footnotes.xml")
                             || STR_STARTS_WITH(part, "word/endnotes.xml")
                             || STR_STARTS_WITH(part, "word/footer")
                             || STR_STARTS_WITH(part, "word/header");
    if (is_word_part) {
        return TRUE;
    }

    // PowerPoint
    const int is_powerpoint_part = STR_STARTS_WITH(part, "ppt/slides/slide")
                                   || STR_STARTS_WITH(part, "ppt/notesSlides/slide");
    if (is_powerpoint_part) {
        return TRUE;
    }

    // Excel
    const int is_excel_part = STR_STARTS_WITH(part, "xl/worksheets/sheet")
                              || STR_STARTS_WITH(part, "xl/sharedStrings.xml")
                              || STR_STARTS_WITH(part, "xl/workbook.xml");
    if (is_excel_part) {
        return TRUE;
    }

    return FALSE;
}
/*
 * Walks `node` and its following siblings (recursing into children) and appends the
 * text of every element named exactly "t" to `buf`, each followed by a space.
 *
 * Returns 0 on success, TEXT_BUF_FULL as soon as the text buffer reports it is full,
 * and -1 when libxml2's last recorded error is fatal.
 */
int extract_text(scan_ooxml_ctx_t *ctx, xmlDoc *xml, xmlNode *node, text_buffer_t *buf) {
    //TODO: Check which nodes are likely to have a 't' child, and ignore nodes that aren't
    xmlErrorPtr last_error = xmlGetLastError();
    if (last_error != NULL && last_error->level == XML_ERR_FATAL) {
        CTX_LOG_ERRORF("ooxml.c", "Got fatal XML error while parsing document: %s", last_error->message)
        return -1;
    }

    xmlNode *cur = node;
    while (cur != NULL) {
        const int is_t_element = cur->name != NULL && cur->name[0] == 't' && cur->name[1] == '\0';

        if (is_t_element) {
            xmlChar *text = xmlNodeListGetString(xml, cur->xmlChildrenNode, 1);
            if (text != NULL) {
                int append_ret = text_buffer_append_string0(buf, (char *) text);
                text_buffer_append_char(buf, ' ');
                xmlFree(text);

                if (append_ret == TEXT_BUF_FULL) {
                    return append_ret;
                }
            }
        }

        if (extract_text(ctx, xml, cur->children, buf) == TEXT_BUF_FULL) {
            return TEXT_BUF_FULL;
        }

        cur = cur->next;
    }

    return 0;
}
/* libxml2 read callback: pulls up to `len` bytes of the current archive entry into `buffer`. */
int xml_io_read(void *context, char *buffer, int len) {
    struct archive *archive_handle = context;
    int bytes_read = (int) archive_read_data(archive_handle, buffer, len);
    return bytes_read;
}
/* libxml2 close callback: does nothing and always reports success. */
int xml_io_close(UNUSED(void *context)) {
    // Intentionally empty: nothing is released here
    return 0;
}
#define READ_PART_ERR (-2)
/*
 * Parses the current archive entry as XML and appends its text to `buf`.
 * Returns READ_PART_ERR when the XML cannot be parsed or has no root element,
 * otherwise whatever extract_text() returns.
 */
__always_inline
static int read_part(scan_ooxml_ctx_t *ctx, struct archive *a, text_buffer_t *buf, document_t *doc) {

    const int parse_options = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;
    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_options);
    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return READ_PART_ERR;
    }

    int result;
    xmlNode *root = xmlDocGetRootElement(xml);
    if (root != NULL) {
        result = extract_text(ctx, xml, root, buf);
    } else {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        result = READ_PART_ERR;
    }

    xmlFreeDoc(xml);
    return result;
}
/*
 * Parses the current archive entry as an extended-properties XML document and, when the
 * root element is "Properties", records the value of its "Pages" child as MetaPages.
 * Returns 0 on success, -1 when the XML cannot be parsed or is empty.
 */
__always_inline
static int read_doc_props_app(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {

    const int parse_options = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;
    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_options);
    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }

    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(xml);
        return -1;
    }

    if (xmlStrEqual(root->name, _X("Properties"))) {
        xmlNode *prop = root->children;
        while (prop != NULL) {
            xmlChar *text = xmlNodeListGetString(xml, prop->xmlChildrenNode, 1);
            if (text != NULL) {
                if (xmlStrEqual(prop->name, _X("Pages"))) {
                    APPEND_LONG_META(doc, MetaPages, strtol((char *) text, NULL, 10))
                }
                xmlFree(text);
            }
            prop = prop->next;
        }
    }

    xmlFreeDoc(xml);
    return 0;
}
/*
 * Parses the current archive entry as a core-properties XML document and, when the root
 * element is "coreProperties", records its "title", "creator" and "lastModifiedBy"
 * children as MetaTitle, MetaAuthor and MetaModifiedBy.
 * Returns 0 on success, -1 when the XML cannot be parsed or is empty.
 */
__always_inline
static int read_doc_props(scan_ooxml_ctx_t *ctx, struct archive *a, document_t *doc) {

    const int parse_options = XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_NONET;
    xmlDoc *xml = xmlReadIO(xml_io_read, xml_io_close, a, "/", NULL, parse_options);
    if (xml == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Could not parse XML")
        return -1;
    }

    xmlNode *root = xmlDocGetRootElement(xml);
    if (root == NULL) {
        CTX_LOG_ERROR(doc->filepath, "Empty document")
        xmlFreeDoc(xml);
        return -1;
    }

    if (xmlStrEqual(root->name, _X("coreProperties"))) {
        xmlNode *prop = root->children;
        while (prop != NULL) {
            xmlChar *text = xmlNodeListGetString(xml, prop->xmlChildrenNode, 1);
            if (text != NULL) {
                if (xmlStrEqual(prop->name, _X("title"))) {
                    APPEND_STR_META(doc, MetaTitle, (char *) text)
                } else if (xmlStrEqual(prop->name, _X("creator"))) {
                    APPEND_STR_META(doc, MetaAuthor, (char *) text)
                } else if (xmlStrEqual(prop->name, _X("lastModifiedBy"))) {
                    APPEND_STR_META(doc, MetaModifiedBy, (char *) text)
                }
                xmlFree(text);
            }
            prop = prop->next;
        }
    }

    xmlFreeDoc(xml);
    return 0;
}
#define MAX_TN_SIZE (1024 * 1024 * 15)
/**
 * Reads the embedded thumbnail stored in the current archive entry and hands it to
 * ctx->store() keyed by the document's path MD5.
 *
 * Nothing is stored when the entry is empty, larger than MAX_TN_SIZE, cannot be
 * allocated, or cannot be read completely.
 *
 * @param ctx    scan context (store + log callbacks)
 * @param doc    document owning the thumbnail
 * @param a      archive positioned on the thumbnail entry
 * @param entry  header of that entry
 */
void read_thumbnail(scan_ooxml_ctx_t *ctx, document_t *doc, struct archive *a, struct archive_entry *entry) {
    // Keep the size signed so that a negative (invalid) size is rejected by the range check
    const long long entry_size = archive_entry_size(entry);

    if (entry_size <= 0 || entry_size > MAX_TN_SIZE) {
        return;
    }

    char *buf = malloc((size_t) entry_size);
    if (buf == NULL) {
        CTX_LOG_ERROR(doc->filepath, "malloc() failed while reading thumbnail")
        return;
    }

    // A short or failed read would otherwise store uninitialised bytes as the thumbnail
    const long long bytes_read = archive_read_data(a, buf, (size_t) entry_size);
    if (bytes_read != entry_size) {
        CTX_LOG_ERROR(doc->filepath, "Could not read thumbnail")
        free(buf);
        return;
    }

    APPEND_TN_META(doc, 1, 1) // Size unknown
    ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), buf, (size_t) entry_size);
    free(buf);
}
/**
 * Parses an Office Open XML package (zip archive): extracts the text of the known
 * Word / PowerPoint / Excel parts into MetaContent, reads docProps/app.xml and
 * docProps/core.xml for metadata and stores docProps/thumbnail.jpeg when present.
 *
 * @param ctx  scan context (content size limit, log and store callbacks)
 * @param f    virtual file to read
 * @param doc  document receiving the metadata
 */
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc) {

    size_t buf_len;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        return;
    }

    struct archive *a = archive_read_new();
    archive_read_support_format_zip(a);

    int ret = archive_read_open_memory(a, buf, buf_len);
    if (ret != ARCHIVE_OK) {
        CTX_LOG_ERRORF(doc->filepath, "Could not read archive: %s", archive_error_string(a))
        archive_read_free(a);
        free(buf);
        return;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    struct archive_entry *entry;
    int buffer_full = FALSE;

    while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
        if (!S_ISREG(archive_entry_stat(entry)->st_mode)) {
            continue;
        }

        const char *path = archive_entry_pathname(entry);
        if (path == NULL) {
            // Without a name the entry cannot match any part; strcmp(NULL, ...) below would be undefined
            continue;
        }

        if (!buffer_full && should_read_part(path) && ctx->content_size > 0) {
            ret = read_part(ctx, a, &tex, doc);
            if (ret == READ_PART_ERR) {
                break;
            } else if (ret == TEXT_BUF_FULL) {
                // Keep iterating: metadata and thumbnail entries may still follow
                buffer_full = TRUE;
            }
        } else if (strcmp(path, "docProps/app.xml") == 0) {
            if (read_doc_props_app(ctx, a, doc) != 0) {
                break;
            }
        } else if (strcmp(path, "docProps/core.xml") == 0) {
            if (read_doc_props(ctx, a, doc) != 0) {
                break;
            }
        } else if (strcmp(path, "docProps/thumbnail.jpeg") == 0) {
            read_thumbnail(ctx, doc, a, entry);
        }
    }

    if (tex.dyn_buffer.cur > 0) {
        text_buffer_terminate_string(&tex);

        // dyn_buffer.cur covers the terminating NUL written just above
        meta_line_t *meta = malloc(sizeof(meta_line_t) + tex.dyn_buffer.cur);
        if (meta == NULL) {
            CTX_LOG_ERROR(doc->filepath, "malloc() failed, content discarded")
        } else {
            meta->key = MetaContent;
            strcpy(meta->str_val, tex.dyn_buffer.buf);
            APPEND_META(doc, meta)
        }
    }

    archive_read_close(a);
    archive_read_free(a);
    text_buffer_destroy(&tex);
    free(buf);
}

View File

@ -1,16 +0,0 @@
#ifndef SCAN_OOXML_H
#define SCAN_OOXML_H
#include <stdlib.h>
#include "../scan.h"
typedef struct {
long content_size;
log_callback_t log;
logf_callback_t logf;
store_callback_t store;
} scan_ooxml_ctx_t;
void parse_ooxml(scan_ooxml_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

@ -1,218 +0,0 @@
#include "raw.h"
#include <libraw/libraw.h>
#include "../media/media.h"
#include <unistd.h>
#define MIN_SIZE 32
/*
 * Stores an already JPEG-encoded libraw image as the document thumbnail by delegating to
 * store_image_thumbnail(); returns that function's result.
 * NOTE(review): the raw context is cast to scan_media_ctx_t, which presumably relies on
 * both structs sharing a compatible layout - confirm against media.h.
 */
int store_thumbnail_jpeg(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
    scan_media_ctx_t *media_ctx = (scan_media_ctx_t *) ctx;
    return store_image_thumbnail(media_ctx, img->data, img->data_size, doc, "x.jpeg");
}
/*
 * Scales an RGB24 libraw image down to fit in a tn_size x tn_size box (keeping the aspect
 * ratio), encodes it with the JPEG encoder returned by alloc_jpeg_encoder() and passes the
 * packet to ctx->store() keyed by the document's path MD5.
 *
 * Returns FALSE without storing anything when either target dimension is <= MIN_SIZE,
 * TRUE otherwise.
 * NOTE(review): none of the FFmpeg allocation / encode return values are checked here.
 */
int store_thumbnail_rgb24(scan_raw_ctx_t *ctx, libraw_processed_image_t *img, document_t *doc) {
int dstW;
int dstH;
// Images that already fit keep their size; larger ones get their longest side set to tn_size
if (img->width <= ctx->tn_size && img->height <= ctx->tn_size) {
dstW = img->width;
dstH = img->height;
} else {
double ratio = (double) img->width / img->height;
if (img->width > img->height) {
dstW = ctx->tn_size;
dstH = (int) (ctx->tn_size / ratio);
} else {
dstW = (int) (ctx->tn_size * ratio);
dstH = ctx->tn_size;
}
}
if (dstW <= MIN_SIZE || dstH <= MIN_SIZE) {
return FALSE;
}
AVFrame *scaled_frame = av_frame_alloc();
// NOTE(review): the scaler targets AV_PIX_FMT_YUVJ420P while the buffer and frame below use
// AV_PIX_FMT_YUV420P - presumably intentional, confirm
struct SwsContext *sws_ctx = sws_getContext(
img->width, img->height, AV_PIX_FMT_RGB24,
dstW, dstH, AV_PIX_FMT_YUVJ420P,
SIST_SWS_ALGO, 0, 0, 0
);
// Destination pixel buffer, attached to scaled_frame's data/linesize arrays
int dst_buf_len = av_image_get_buffer_size(AV_PIX_FMT_YUV420P, dstW, dstH, 1);
uint8_t *dst_buf = (uint8_t *) av_malloc(dst_buf_len);
av_image_fill_arrays(scaled_frame->data, scaled_frame->linesize, dst_buf, AV_PIX_FMT_YUV420P, dstW, dstH, 1);
// Source is a single packed plane with 3 bytes per pixel
const uint8_t *in_data[1] = {img->data};
int in_line_size[1] = {3 * img->width};
sws_scale(sws_ctx,
in_data, in_line_size,
0, img->height,
scaled_frame->data, scaled_frame->linesize
);
scaled_frame->width = dstW;
scaled_frame->height = dstH;
scaled_frame->format = AV_PIX_FMT_YUV420P;
sws_freeContext(sws_ctx);
// Encode the scaled frame as a single JPEG packet
AVCodecContext *jpeg_encoder = alloc_jpeg_encoder(scaled_frame->width, scaled_frame->height, 1.0f);
avcodec_send_frame(jpeg_encoder, scaled_frame);
AVPacket jpeg_packet;
av_init_packet(&jpeg_packet);
avcodec_receive_packet(jpeg_encoder, &jpeg_packet);
APPEND_TN_META(doc, scaled_frame->width, scaled_frame->height)
ctx->store((char *) doc->path_md5, sizeof(doc->path_md5), (char *) jpeg_packet.data, jpeg_packet.size);
av_packet_unref(&jpeg_packet);
// scaled_frame->data[0] is dst_buf (allocated with av_malloc above), so it is freed separately
av_free(*scaled_frame->data);
av_frame_free(&scaled_frame);
avcodec_free_context(&jpeg_encoder);
return TRUE;
}
#define DMS_REF(ref) (((ref) == 'S' || (ref) == 'W') ? -1 : 1)
/**
 * Parses a camera RAW file with LibRaw: records EXIF-like metadata on `doc` and, when
 * ctx->tn_size > 0, stores a thumbnail (the embedded one when usable, otherwise one
 * rendered from the decoded image).
 *
 * @param ctx  scan context (thumbnail size, log and store callbacks)
 * @param f    virtual file to read
 * @param doc  document receiving the metadata
 */
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc) {
    libraw_data_t *libraw_lib = libraw_init(0);

    if (!libraw_lib) {
        CTX_LOG_ERROR("raw.c", "Cannot create libraw handle")
        return;
    }

    size_t buf_len = 0;
    void *buf = read_all(f, &buf_len);
    if (buf == NULL) {
        CTX_LOG_ERROR(f->filepath, "read_all() failed")
        // The handle was previously leaked on this path
        libraw_close(libraw_lib);
        return;
    }

    int ret = libraw_open_buffer(libraw_lib, buf, buf_len);
    if (ret != 0) {
        CTX_LOG_ERROR(f->filepath, "Could not open raw file")
        goto cleanup;
    }

    if (*libraw_lib->idata.model != '\0') {
        APPEND_STR_META(doc, MetaExifModel, libraw_lib->idata.model)
    }
    if (*libraw_lib->idata.make != '\0') {
        APPEND_STR_META(doc, MetaExifMake, libraw_lib->idata.make)
    }
    if (*libraw_lib->idata.software != '\0') {
        APPEND_STR_META(doc, MetaExifSoftware, libraw_lib->idata.software)
    }
    APPEND_LONG_META(doc, MetaWidth, libraw_lib->sizes.width)
    APPEND_LONG_META(doc, MetaHeight, libraw_lib->sizes.height)

    char tmp[1024];
    snprintf(tmp, sizeof(tmp), "%g", libraw_lib->other.iso_speed);
    APPEND_STR_META(doc, MetaExifIsoSpeedRatings, tmp)

    if (*libraw_lib->other.desc != '\0') {
        APPEND_STR_META(doc, MetaContent, libraw_lib->other.desc)
    }
    if (*libraw_lib->other.artist != '\0') {
        APPEND_STR_META(doc, MetaArtist, libraw_lib->other.artist)
    }

    // localtime() may return NULL; strftime() must not be called with a NULL tm
    struct tm *time = localtime(&libraw_lib->other.timestamp);
    if (time != NULL) {
        strftime(tmp, sizeof(tmp), "%Y:%m:%d %H:%M:%S", time);
        APPEND_STR_META(doc, MetaExifDateTime, tmp)
    }

    snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.focal_len);
    APPEND_STR_META(doc, MetaExifFocalLength, tmp)

    snprintf(tmp, sizeof(tmp), "%.1f", libraw_lib->other.aperture);
    APPEND_STR_META(doc, MetaExifFNumber, tmp)

    // NOTE(review): a shutter value of 0 makes this conversion meaningless - confirm whether it can occur
    int denominator = (int) roundf(1 / libraw_lib->other.shutter);
    snprintf(tmp, sizeof(tmp), "1/%d", denominator);
    APPEND_STR_META(doc, MetaExifExposureTime, tmp)

    // Degrees/minutes/seconds -> signed decimal degrees (negative for S and W references)
    libraw_gps_info_t gps = libraw_lib->other.parsed_gps;
    double gps_longitude_dec =
            (gps.longtitude[0] + gps.longtitude[1] / 60 + gps.longtitude[2] / 3600) * DMS_REF(gps.longref);
    snprintf(tmp, sizeof(tmp), "%.15f", gps_longitude_dec);
    if (gps_longitude_dec != 0.0) {
        APPEND_STR_META(doc, MetaExifGpsLongitudeDec, tmp)
    }

    double gps_latitude_dec = (gps.latitude[0] + gps.latitude[1] / 60 + gps.latitude[2] / 3600) * DMS_REF(gps.latref);
    snprintf(tmp, sizeof(tmp), "%.15f", gps_latitude_dec);
    if (gps_latitude_dec != 0.0) {
        APPEND_STR_META(doc, MetaExifGpsLatitudeDec, tmp)
    }

    APPEND_STR_META(doc, MetaMediaVideoCodec, "raw")

    if (ctx->tn_size <= 0) {
        goto cleanup;
    }

    // First choice: the thumbnail embedded in the raw file
    libraw_unpack_thumb(libraw_lib);

    int errc = 0;
    libraw_processed_image_t *thumb = libraw_dcraw_make_mem_thumb(libraw_lib, &errc);
    if (errc != 0) {
        libraw_dcraw_clear_mem(thumb);
        goto cleanup;
    }

    int tn_ok = 0;
    if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_JPEG) {
        tn_ok = store_thumbnail_jpeg(ctx, thumb, doc);
    } else if (libraw_lib->thumbnail.tformat == LIBRAW_THUMBNAIL_BITMAP) {
        // TODO: technically this should work but is currently untested
        tn_ok = store_thumbnail_rgb24(ctx, thumb, doc);
    }

    libraw_dcraw_clear_mem(thumb);

    if (tn_ok == TRUE) {
        goto cleanup;
    }

    // Fallback: decode the full image and scale it down
    ret = libraw_unpack(libraw_lib);
    if (ret != 0) {
        CTX_LOG_ERROR(f->filepath, "Could not unpack raw file")
        goto cleanup;
    }

    libraw_dcraw_process(libraw_lib);

    errc = 0;
    libraw_processed_image_t *img = libraw_dcraw_make_mem_image(libraw_lib, &errc);
    if (errc != 0) {
        libraw_dcraw_clear_mem(img);
        goto cleanup;
    }

    store_thumbnail_rgb24(ctx, img, doc);
    libraw_dcraw_clear_mem(img);

cleanup:
    libraw_close(libraw_lib);
    free(buf);
}

View File

@ -1,17 +0,0 @@
#ifndef SIST2_RAW_H
#define SIST2_RAW_H
#include "../scan.h"
/* Context handed to parse_raw(). */
typedef struct {
/* Plain-string logging callback (invoked through the CTX_LOG_* macros). */
log_callback_t log;
/* printf-style logging callback (invoked through the CTX_LOG_*F macros). */
logf_callback_t logf;
/* Callback receiving the encoded thumbnail bytes. */
store_callback_t store;
/* Maximum thumbnail width/height; parse_raw() skips thumbnails when this is <= 0. */
int tn_size;
/* NOTE(review): not read by raw.c itself - presumably a JPEG quality setting, confirm. */
float tn_qscale;
} scan_raw_ctx_t;
void parse_raw(scan_raw_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif //SIST2_RAW_H

View File

@ -1,170 +0,0 @@
#ifndef SCAN_SCAN_H
#define SCAN_SCAN_H
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <stdio.h>
#include <sys/stat.h>
#include <openssl/md5.h>
#include <openssl/sha.h>
#include "macros.h"
#define SIST_SWS_ALGO SWS_LANCZOS
#define UNUSED(x) __attribute__((__unused__)) x
/* Callback that persists `buf_len` bytes of `buf` under the binary key `key` of `key_len` bytes. */
typedef void (*store_callback_t)(char *key, size_t key_len, char *buf, size_t buf_len);
/* printf-style logging callback: file path, LEVEL_* constant, format string and its arguments. */
typedef void (*logf_callback_t)(const char *filepath, int level, char *format, ...);
/* Logging callback taking a ready-made message: file path, LEVEL_* constant, message. */
typedef void (*log_callback_t)(const char *filepath, int level, char *str);
/* Result code returned by parsers such as parse_text() / parse_markup(). */
typedef int scan_code_t;
#define SCAN_OK (scan_code_t) 0
#define SCAN_ERR_READ (scan_code_t) (-1)
#define SCAN_ERR_SKIP (scan_code_t) (-2)
/* Log levels passed as the `level` argument of the logging callbacks. */
#define LEVEL_DEBUG 0
#define LEVEL_INFO 1
#define LEVEL_WARNING 2
#define LEVEL_ERROR 3
#define LEVEL_FATAL 4
/*
 * Logging helpers. They expect a variable named `ctx` with `log` / `logf` members to be in
 * scope at the call site. Each expansion already ends with a semicolon, so call sites may
 * omit it. The *F variants take a format string and require at least one variadic argument.
 */
#define CTX_LOG_DEBUGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_DEBUG, fmt, __VA_ARGS__);
#define CTX_LOG_DEBUG(filepath, str) ctx->log(filepath, LEVEL_DEBUG, str);
#define CTX_LOG_INFOF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_INFO, fmt, __VA_ARGS__);
#define CTX_LOG_INFO(filepath, str) ctx->log(filepath, LEVEL_INFO, str);
#define CTX_LOG_WARNINGF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_WARNING, fmt, __VA_ARGS__);
#define CTX_LOG_WARNING(filepath, str) ctx->log(filepath, LEVEL_WARNING, str);
#define CTX_LOG_ERRORF(filepath, fmt, ...) ctx->logf(filepath, LEVEL_ERROR, fmt, __VA_ARGS__);
#define CTX_LOG_ERROR(filepath, str) ctx->log(filepath, LEVEL_ERROR, str);
/*
 * Log at LEVEL_FATAL through the `ctx` in scope, then terminate the process with exit(-1).
 * The two statements are wrapped in do { } while (0) so that the macro behaves as a single
 * statement: without it, `if (cond) CTX_LOG_FATAL(...)` would call exit() unconditionally.
 * The trailing semicolon is kept so that call sites omitting it still compile.
 */
#define CTX_LOG_FATALF(filepath, fmt, ...) do { ctx->logf(filepath, LEVEL_FATAL, fmt, __VA_ARGS__); exit(-1); } while (0);
#define CTX_LOG_FATAL(filepath, str) do { ctx->log(filepath, LEVEL_FATAL, str); exit(-1); } while (0);
/*
 * Identifies the kind of value carried by a meta_line_t.
 * Values are assigned sequentially starting at 1, so the order of the enumerators
 * determines their numeric value; the comments below group keys by value type.
 */
enum metakey {
// String
MetaContent = 1,
MetaMediaAudioCodec,
MetaMediaVideoCodec,
MetaArtist,
MetaAlbum,
MetaAlbumArtist,
MetaGenre,
MetaTitle,
MetaFontName,
MetaParent,
MetaExifMake,
MetaExifSoftware,
MetaExifExposureTime,
MetaExifFNumber,
MetaExifFocalLength,
MetaExifUserComment,
MetaExifModel,
MetaExifIsoSpeedRatings,
MetaExifDateTime,
MetaAuthor,
MetaModifiedBy,
MetaThumbnail,
MetaChecksum,
// Number
MetaWidth,
MetaHeight,
MetaMediaDuration,
MetaMediaBitrate,
MetaPages,
// ??
MetaExifGpsLongitudeDMS,
MetaExifGpsLongitudeRef,
MetaExifGpsLatitudeDMS,
MetaExifGpsLatitudeRef,
MetaExifGpsLatitudeDec,
MetaExifGpsLongitudeDec,
};
/*
 * One metadata entry of a document, chained into a singly linked list through `next`.
 * String values are stored inline after the header: such nodes are allocated as
 * sizeof(meta_line_t) plus the string size and written through `str_val`.
 */
typedef struct meta_line {
struct meta_line *next;
enum metakey key;
union {
// Zero-length array (compiler extension) marking the start of the inline string
char str_val[0];
unsigned long long_val;
double double_val;
};
} meta_line_t;
/* A scanned document together with the linked list of metadata collected for it. */
typedef struct document {
// Used as the storage key for thumbnails - presumably the MD5 of the path, confirm
unsigned char path_md5[MD5_DIGEST_LENGTH];
unsigned long size;
unsigned int mime;
int mtime;
// NOTE(review): presumably offsets of the base name and extension inside filepath - confirm
short base;
short ext;
char has_parent;
// Head and tail of the meta_line_t list
meta_line_t *meta_head;
meta_line_t *meta_tail;
// Path passed to the logging callbacks when reporting errors about this document
char *filepath;
} document_t;
typedef struct vfile vfile_t;
/* Reads up to `size` bytes into `buf`; read_all() treats the return value as the number of bytes read. */
__attribute__((warn_unused_result))
typedef int (*read_func_t)(struct vfile *, void *buf, size_t size);
__attribute__((warn_unused_result))
typedef long (*seek_func_t)(struct vfile *, long offset, int whence);
typedef void (*close_func_t)(struct vfile *);
typedef void (*reset_func_t)(struct vfile *);
/*
 * Virtual file: a source of bytes read through the callbacks below rather than directly.
 * NOTE(review): the union suggests it can be backed by a file descriptor, an archive
 * entry or test data - confirm which member is active against is_fs_file and the callers.
 */
typedef struct vfile {
union {
int fd;
struct archive *arc;
const void *_test_data;
};
int is_fs_file;
int has_checksum;
int calculate_checksum;
const char *filepath;
// info.st_size is used by the parsers as the number of bytes to read
struct stat info;
SHA_CTX sha1_ctx;
unsigned char sha1_digest[SHA1_DIGEST_LENGTH];
void *rewind_buffer;
int rewind_buffer_size;
int rewind_buffer_cursor;
read_func_t read;
read_func_t read_rewindable;
close_func_t close;
reset_func_t reset;
log_callback_t log;
logf_callback_t logf;
} vfile_t;
/* Unit of work handed to a parse_callback_t. */
typedef struct parse_job_t {
int base;
int ext;
struct vfile vfile;
unsigned char parent[MD5_DIGEST_LENGTH];
// NOTE(review): one-element trailing array - presumably the struct is over-allocated so
// that filepath can hold the whole path; confirm at the allocation site
char filepath[1];
} parse_job_t;
#include "util.h"
typedef void (*parse_callback_t)(parse_job_t *job);
#endif

View File

@ -1,64 +0,0 @@
#include "text.h"
/**
 * Reads up to ctx->content_size bytes of a text file and stores them as MetaContent.
 * A leading 2-byte marker selects one of the naive UTF-16 conversions; anything else is
 * treated as UTF-8.
 *
 * @param ctx  scan context (content size limit, log callbacks)
 * @param f    virtual file to read
 * @param doc  document receiving MetaContent
 * @return SCAN_OK on success (including files too short to index),
 *         SCAN_ERR_READ when the buffer cannot be allocated or the read fails
 */
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {

    int to_read = MIN(ctx->content_size, f->info.st_size);

    if (to_read <= 2) {
        return SCAN_OK;
    }

    char *buf = malloc(to_read);
    if (buf == NULL) {
        CTX_LOG_ERROR(doc->filepath, "malloc() failed")
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
        free(buf);
        return SCAN_ERR_READ;
    }

    // Only the bytes actually read are initialised: a short read must not expose the rest
    if (ret <= 2) {
        free(buf);
        return SCAN_OK;
    }

    text_buffer_t tex = text_buffer_create(ctx->content_size);

    // Same comparison as reading the first two bytes as a native int16_t, without the pointer cast
    int16_t marker;
    memcpy(&marker, buf, sizeof(marker));

    if (marker == (int16_t) 0xFFFE) {
        text_buffer_append_string16_le(&tex, buf + 2, ret - 2);
    } else if (marker == (int16_t) 0xFEFF) {
        text_buffer_append_string16_be(&tex, buf + 2, ret - 2);
    } else {
        text_buffer_append_string(&tex, buf, ret);
    }
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}
#define MAX_MARKUP_SIZE (1024 * 1024)
/**
 * Reads up to MAX_MARKUP_SIZE bytes of a markup file, strips the tags and stores the
 * remaining text as MetaContent.
 *
 * @param ctx  scan context (content size limit, log callbacks)
 * @param f    virtual file to read
 * @param doc  document receiving MetaContent
 * @return SCAN_OK on success, SCAN_ERR_READ when the buffer cannot be allocated or the read fails
 */
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc) {

    int to_read = MIN(MAX_MARKUP_SIZE, f->info.st_size);

    // One extra byte for the NUL terminator expected by text_buffer_append_markup()
    char *buf = malloc(to_read + 1);
    if (buf == NULL) {
        CTX_LOG_ERROR(doc->filepath, "malloc() failed")
        return SCAN_ERR_READ;
    }

    int ret = f->read(f, buf, to_read);
    if (ret < 0) {
        CTX_LOG_ERRORF(doc->filepath, "read() returned error code: [%d]", ret)
        free(buf);
        return SCAN_ERR_READ;
    }

    // Terminate after the bytes actually read so that a short read never exposes
    // uninitialised memory to the markup scanner
    buf[ret] = '\0';

    text_buffer_t tex = text_buffer_create(ctx->content_size);
    text_buffer_append_markup(&tex, buf);
    text_buffer_terminate_string(&tex);

    APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf);

    free(buf);
    text_buffer_destroy(&tex);

    return SCAN_OK;
}

View File

@ -1,18 +0,0 @@
#ifndef SCAN_TEXT_H
#define SCAN_TEXT_H
#include "../scan.h"
#include "../util.h"
/* Context handed to parse_text() and parse_markup(). */
typedef struct {
/* Size limit of the extracted text; parse_text() also uses it to bound the number of bytes read. */
long content_size;
/* Plain-string logging callback (invoked through the CTX_LOG_* macros). */
log_callback_t log;
/* printf-style logging callback (invoked through the CTX_LOG_*F macros). */
logf_callback_t logf;
} scan_text_ctx_t;
scan_code_t parse_text(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
scan_code_t parse_markup(scan_text_ctx_t *ctx, vfile_t *f, document_t *doc);
#endif

View File

View File

@ -1,361 +0,0 @@
#ifndef SCAN_UTIL_H
#define SCAN_UTIL_H
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "../third-party/utf8.h/utf8.h"
#include "macros.h"
/* True when string x starts with y. y must be a string literal (or array): its length is taken with sizeof. */
#define STR_STARTS_WITH(x, y) (strncmp(y, x, sizeof(y) - 1) == 0)
/* Returned by the text_buffer_append_* functions once the buffer exceeds its max_size. */
#define TEXT_BUF_FULL (-1)
/* Initial capacity, in bytes, of a dyn_buffer_t. */
#define INITIAL_BUF_SIZE (1024 * 16)
/* Characters that are not kept are replaced by a single space by text_buffer_append_char(). */
#define SHOULD_IGNORE_CHAR(c) !(SHOULD_KEEP_CHAR(c))
/*
 * Kept characters: the ASCII range from the apostrophe to ';', the ASCII range 'A'..'z',
 * and every value above 127 except 0x00A0 and 0xFFFD.
 */
#define SHOULD_KEEP_CHAR(c) (\
((c) >= '\'' && (c) <= ';') || \
((c) >= 'A' && (c) <= 'z') || \
((c) > 127 && (c) != 0x00A0 && (c) && (c) != 0xFFFD))
/* Growable byte buffer. */
typedef struct dyn_buffer {
/* Heap storage, allocated by dyn_buffer_create() and released by dyn_buffer_destroy(). */
char *buf;
/* Write position: number of bytes written so far. */
size_t cur;
/* Current capacity of buf in bytes. */
size_t size;
} dyn_buffer_t;
/* Text accumulator built on top of dyn_buffer_t that collapses ignored characters into single spaces. */
typedef struct text_buffer {
/* Size limit in bytes; the limit is only enforced when this is > 0. */
long max_size;
/* Set after a space has been written so that consecutive ignored characters emit only one. */
int last_char_was_whitespace;
dyn_buffer_t dyn_buffer;
} text_buffer_t;
/*
 * Checks that `s` starts with exactly one well-formed UTF-8 sequence.
 * Returns TRUE for a single ASCII byte or a valid 2-, 3- or 4-byte sequence, FALSE otherwise.
 * The function looks one byte past the sequence (up to s[4]), so `s` must have at least
 * 5 readable bytes.
 */
static int utf8_validchr2(const char *s) {
// High bit clear: single-byte (ASCII) character
if (0x00 == (0x80 & *s)) {
return TRUE;
} else if (0xf0 == (0xf8 & *s)) {
// 4-byte lead: the next three bytes must be continuation bytes (10xxxxxx)
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
(0x80 != (0xc0 & s[3]))) {
return FALSE;
}
// A further continuation byte means the sequence is longer than announced
if (0x80 == (0xc0 & s[4])) {
return FALSE;
}
// Overlong encoding: the value would have fit in fewer bytes
if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
return FALSE;
}
} else if (0xe0 == (0xf0 & *s)) {
// 3-byte lead: the next two bytes must be continuation bytes
if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
return FALSE;
}
if (0x80 == (0xc0 & s[3])) {
return FALSE;
}
// Overlong encoding
if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
return FALSE;
}
} else if (0xc0 == (0xe0 & *s)) {
// 2-byte lead: the next byte must be a continuation byte
if (0x80 != (0xc0 & s[1])) {
return FALSE;
}
if (0x80 == (0xc0 & s[2])) {
return FALSE;
}
// Overlong encoding
if (0 == (0x1e & s[0])) {
return FALSE;
}
} else {
// Neither ASCII nor a valid lead byte
return FALSE;
}
return TRUE;
}
/* Returns an empty buffer with INITIAL_BUF_SIZE bytes of heap storage. */
static dyn_buffer_t dyn_buffer_create() {
    dyn_buffer_t fresh;

    fresh.buf = (char *) malloc(INITIAL_BUF_SIZE);
    fresh.cur = 0;
    fresh.size = INITIAL_BUF_SIZE;

    return fresh;
}
/* Makes room for `size` more bytes after the write position, doubling the capacity as often as needed. */
static void grow_buffer(dyn_buffer_t *buf, size_t size) {
    if (buf->cur + size <= buf->size) {
        return;
    }

    // The capacity is known to be too small here, so it is doubled at least once
    buf->size *= 2;
    while (buf->cur + size > buf->size) {
        buf->size *= 2;
    }

    buf->buf = (char *) realloc(buf->buf, buf->size);
}
/* Doubles the capacity once when fewer than sizeof(long) bytes remain after the write position. */
static void grow_buffer_small(dyn_buffer_t *buf) {
    const size_t needed = buf->cur + sizeof(long);
    if (needed <= buf->size) {
        return;
    }

    buf->size *= 2;
    buf->buf = (char *) realloc(buf->buf, buf->size);
}
/* Appends `size` raw bytes from `data`, growing the buffer first when required. */
static void dyn_buffer_write(dyn_buffer_t *buf, const void *data, size_t size) {
    grow_buffer(buf, size);

    // Computed after growing: the storage may have moved
    char *dst = buf->buf + buf->cur;
    memcpy(dst, data, size);

    buf->cur += size;
}
/* Appends a single byte. */
static void dyn_buffer_write_char(dyn_buffer_t *buf, char c) {
    grow_buffer_small(buf);

    buf->buf[buf->cur] = c;
    buf->cur += 1;
}
/* Appends the characters of `str` followed by a NUL terminator. */
static void dyn_buffer_write_str(dyn_buffer_t *buf, const char *str) {
    const size_t length = strlen(str);

    dyn_buffer_write(buf, str, length);
    dyn_buffer_write_char(buf, '\0');
}
/* Appends the characters of `str` without its NUL terminator. */
static void dyn_buffer_append_string(dyn_buffer_t *buf, const char *str) {
    const size_t length = strlen(str);
    dyn_buffer_write(buf, str, length);
}
/*
 * Appends the bytes of `d` in host representation.
 * memcpy is used instead of storing through a cast pointer because the write position
 * can be at any byte offset, where a direct int store would be misaligned.
 */
static void dyn_buffer_write_int(dyn_buffer_t *buf, int d) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &d, sizeof(d));
    buf->cur += sizeof(d);
}
/*
 * Appends the bytes of `s` in host representation.
 * memcpy is used instead of storing through a cast pointer because the write position
 * can be at any byte offset, where a direct 16-bit store would be misaligned.
 */
static void dyn_buffer_write_short(dyn_buffer_t *buf, uint16_t s) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &s, sizeof(s));
    buf->cur += sizeof(s);
}
/*
 * Appends the bytes of `l` in host representation.
 * memcpy is used instead of storing through a cast pointer because the write position
 * can be at any byte offset, where a direct unsigned long store would be misaligned.
 */
static void dyn_buffer_write_long(dyn_buffer_t *buf, unsigned long l) {
    grow_buffer_small(buf);
    memcpy(buf->buf + buf->cur, &l, sizeof(l));
    buf->cur += sizeof(l);
}
/* Releases the heap storage of the buffer; the struct itself is not freed. */
static void dyn_buffer_destroy(dyn_buffer_t *buf) {
    char *storage = buf->buf;
    free(storage);
}
/* Releases the storage owned by a text buffer. */
static void text_buffer_destroy(text_buffer_t *buf) {
    dyn_buffer_t *inner = &buf->dyn_buffer;
    dyn_buffer_destroy(inner);
}
/* Returns an empty text buffer limited to `max_size` bytes (no limit is enforced when max_size <= 0). */
static text_buffer_t text_buffer_create(long max_size) {
    text_buffer_t created;

    created.max_size = max_size;
    created.last_char_was_whitespace = FALSE;
    created.dyn_buffer = dyn_buffer_create();

    return created;
}
/*
 * Appends the code point `c` to the buffer, encoded as UTF-8.
 * Ignored characters and spaces are collapsed: at most one space is written for a run of
 * them, and none at the very beginning of the buffer.
 * Returns TEXT_BUF_FULL once the buffer has grown past max_size (when max_size > 0), 0 otherwise.
 */
static int text_buffer_append_char(text_buffer_t *buf, int c) {
if (SHOULD_IGNORE_CHAR(c) || c == ' ') {
if (!buf->last_char_was_whitespace && buf->dyn_buffer.cur != 0) {
dyn_buffer_write_char(&buf->dyn_buffer, ' ');
buf->last_char_was_whitespace = TRUE;
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
} else {
buf->last_char_was_whitespace = FALSE;
// Guarantees at least sizeof(long) free bytes, enough for the longest (4-byte) sequence below
grow_buffer_small(&buf->dyn_buffer);
if (((utf8_int32_t) 0xffffff80 & c) == 0) {
// Fits in 7 bits: single byte
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = (char) c;
} else if (((utf8_int32_t) 0xfffff800 & c) == 0) {
// Fits in 11 bits: two bytes
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xc0 | (char) (c >> 6);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else if (((utf8_int32_t) 0xffff0000 & c) == 0) {
// Fits in 16 bits: three bytes
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xe0 | (char) (c >> 12);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
} else {
// Everything else: four bytes
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0xf0 | (char) (c >> 18);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 12) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) ((c >> 6) & 0x3f);
*(buf->dyn_buffer.buf + buf->dyn_buffer.cur++) = 0x80 | (char) (c & 0x3f);
}
if (buf->max_size > 0 && buf->dyn_buffer.cur > buf->max_size) {
return TEXT_BUF_FULL;
}
}
return 0;
}
/*
 * NUL-terminates the accumulated text. A trailing space is overwritten by the terminator,
 * otherwise the terminator is appended.
 */
static void text_buffer_terminate_string(text_buffer_t *buf) {
    dyn_buffer_t *dyn = &buf->dyn_buffer;

    if (dyn->cur > 0 && dyn->buf[dyn->cur - 1] == ' ') {
        dyn->buf[dyn->cur - 1] = '\0';
        return;
    }

    dyn_buffer_write_char(dyn, '\0');
}
// Naive UTF16 -> ascii conversion
/*
 * Appends every second byte of `str`, starting at index 1, as a character.
 * Stops and returns TEXT_BUF_FULL as soon as the buffer reports it is full, like
 * text_buffer_append_string() does; returns 0 otherwise.
 */
static int text_buffer_append_string16_le(text_buffer_t *buf, const char *str, size_t len) {
    // size_t index: `len` is a size_t, an int index would be compared as unsigned and could overflow
    for (size_t i = 1; i < len; i += 2) {
        int ret = text_buffer_append_char(buf, str[i]);
        if (ret != 0) {
            return ret;
        }
    }
    return 0;
}
/*
 * Naive UTF16 -> ascii conversion: appends every second byte of `str`, starting at
 * index 0, as a character.
 * Stops and returns TEXT_BUF_FULL as soon as the buffer reports it is full, like
 * text_buffer_append_string() does; returns 0 otherwise.
 */
static int text_buffer_append_string16_be(text_buffer_t *buf, const char *str, size_t len) {
    // size_t index: `len` is a size_t, an int index would be compared as unsigned and could overflow
    for (size_t i = 0; i < len; i += 2) {
        int ret = text_buffer_append_char(buf, str[i]);
        if (ret != 0) {
            return ret;
        }
    }
    return 0;
}
/*
 * True when `ptr` has consumed `len` bytes of `str`, points at a NUL, or points at a
 * multi-byte lead byte whose sequence would extend past `len`.
 * Expects `ptr`, `str` and `len` to be in scope.
 */
#define UTF8_END_OF_STRING \
(ptr - str >= len || *ptr == 0 || \
(0xc0 == (0xe0 & *ptr) && ptr - str > len - 2) || \
(0xe0 == (0xf0 & *ptr) && ptr - str > len - 3) || \
(0xf0 == (0xf8 & *ptr) && ptr - str > len - 4))
/*
 * Appends up to `len` bytes of the UTF-8 string `str` to the text buffer, skipping
 * malformed sequences.
 * Returns 0 on success (also for NULL or empty input) or the non-zero value returned by
 * text_buffer_append_char() (TEXT_BUF_FULL) when the buffer is full.
 */
static int text_buffer_append_string(text_buffer_t *buf, const char *str, size_t len) {
const char *ptr = str;
const char *oldPtr = ptr;
if (str == NULL || UTF8_END_OF_STRING) {
return 0;
}
// Very short input: only kept ASCII characters are copied, written directly to the
// underlying buffer (no whitespace collapsing, no max_size check)
if (len <= 4) {
for (int i = 0; i < len; i++) {
if (((utf8_int32_t) 0xffffff80 & str[i]) == 0 && SHOULD_KEEP_CHAR(str[i])) {
dyn_buffer_write_char(&buf->dyn_buffer, str[i]);
}
}
return 0;
}
utf8_int32_t c;
char tmp[16] = {0};
do {
// Decode one code point, then copy its raw bytes to tmp so they can be validated
ptr = (char *) utf8codepoint(ptr, &c);
// Clear the bytes left over from a previous, longer sequence
*(int *) tmp = 0x00000000;
memcpy(tmp, oldPtr, ptr - oldPtr);
oldPtr = ptr;
if (!utf8_validchr2(tmp)) {
continue;
}
int ret = text_buffer_append_char(buf, c);
if (ret != 0) {
return ret;
}
} while (!UTF8_END_OF_STRING);
return 0;
}
/* Appends a NUL-terminated UTF-8 string; see text_buffer_append_string() for the return value. */
static int text_buffer_append_string0(text_buffer_t *buf, const char *str) {
    const size_t length = strlen(str);
    return text_buffer_append_string(buf, str, length);
}
/*
 * Appends the text found between tags of a NUL-terminated markup string, each run
 * followed by a space. Scanning starts in the "inside a tag" state, so nothing is emitted
 * before the first '>'; whatever follows the last '>' is appended when the end of the
 * string is reached.
 * Returns TEXT_BUF_FULL as soon as the buffer is full, 0 otherwise.
 */
static int text_buffer_append_markup(text_buffer_t *buf, const char *markup) {

    int inside_tag = TRUE;
    const char *text_start = markup;
    const char *cursor;

    for (cursor = markup; *cursor != '\0'; cursor++) {
        if (inside_tag) {
            if (*cursor == '>') {
                inside_tag = FALSE;
                text_start = cursor + 1;
            }
            continue;
        }

        if (*cursor != '<') {
            continue;
        }

        // A tag opens here: flush the text collected since the previous tag, if any
        inside_tag = TRUE;
        if (cursor == text_start) {
            continue;
        }
        if (text_buffer_append_string(buf, text_start, (cursor - text_start)) == TEXT_BUF_FULL
            || text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
            return TEXT_BUF_FULL;
        }
    }

    // Flush whatever follows the last '>'
    if (cursor != text_start) {
        if (text_buffer_append_string(buf, text_start, (cursor - text_start)) == TEXT_BUF_FULL
            || text_buffer_append_char(buf, ' ') == TEXT_BUF_FULL) {
            return TEXT_BUF_FULL;
        }
    }

    return 0;
}
/*
 * Reads the whole file (f->info.st_size bytes) into a freshly allocated buffer.
 *
 * Returns the buffer, owned by the caller (release with free()), and stores its length
 * in *size. Returns NULL, with *size set to 0, when the buffer cannot be allocated, the
 * read fails, or fewer bytes than expected are read.
 */
static void *read_all(vfile_t *f, size_t *size) {
    *size = 0;

    void *buf = malloc(f->info.st_size);
    if (buf == NULL) {
        // Never hand a NULL destination to the read callback
        return NULL;
    }

    const int bytes_read = f->read(f, buf, f->info.st_size);
    if (bytes_read < 0 || (size_t) bytes_read != (size_t) f->info.st_size) {
        free(buf);
        return NULL;
    }

    *size = (size_t) bytes_read;
    return buf;
}
#define STACK_BUFFER_SIZE (size_t)(4096 * 8)
/*
 * Feeds `size` bytes of `buf` to the SHA-1 context through a private copy.
 *
 * The data is copied and hashed in chunks of at most STACK_BUFFER_SIZE bytes using a
 * stack buffer. Hashing consecutive chunks yields the same digest as hashing the whole
 * range at once, and no heap allocation (which could fail) is needed for large inputs.
 */
__always_inline
static void safe_sha1_update(SHA_CTX *ctx, void *buf, size_t size) {
    unsigned char stack_buf[STACK_BUFFER_SIZE];

    const unsigned char *src = (const unsigned char *) buf;
    size_t remaining = size;

    while (remaining > 0) {
        const size_t chunk = remaining < STACK_BUFFER_SIZE ? remaining : STACK_BUFFER_SIZE;

        memcpy(stack_buf, src, chunk);
        SHA1_Update(ctx, (const void *) stack_buf, chunk);

        src += chunk;
        remaining -= chunk;
    }
}
#endif

View File

@ -1,200 +0,0 @@
#include "libwpd_c_api.h"
#include "libwpd/libwpd.h"
#include "libwpd/WPXProperty.h"
#include "libwpd-stream/libwpd-stream.h"
class StringDocument : public WPXDocumentInterface {
private:
text_buffer_t *tex;
document_t *doc;
bool is_full;
public:
// Binds the handler to the text buffer it fills and to the document being parsed.
StringDocument(text_buffer_t *tex, document_t *doc)
        : tex(tex), doc(doc), is_full(false) {
}
// Iterates over the document properties; nothing is extracted from them yet.
void setDocumentMetaData(const WPXPropertyList &propList) override {
    WPXPropertyList::Iter propIter(propList);

    propIter.rewind();
    while (propIter.next()) {
        // TODO: Read metadata here ?!
    }
}
void endDocument() override {
text_buffer_terminate_string(this->tex);
}
// A paragraph boundary becomes a space, unless the buffer is already full.
void closeParagraph() override {
    if (this->is_full) {
        return;
    }
    this->is_full = text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
}
// A span boundary becomes a space, unless the buffer is already full.
void closeSpan() override {
    if (this->is_full) {
        return;
    }
    this->is_full = text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
}
// A section boundary becomes a space, unless the buffer is already full.
void closeSection() override {
    if (this->is_full) {
        return;
    }
    this->is_full = text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
}
// A tab becomes a space, unless the buffer is already full.
void insertTab() override {
    if (this->is_full) {
        return;
    }
    this->is_full = text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
}
// Appends a space, unless the buffer is already full.
void insertSpace() override {
    if (this->is_full) {
        return;
    }
    this->is_full = text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
}
// Appends a run of document text, unless the buffer is already full.
void insertText(const WPXString &text) override {
    if (this->is_full) {
        return;
    }
    this->is_full = text_buffer_append_string0(tex, text.cstr()) == TEXT_BUF_FULL;
}
// A line break becomes a space, unless the buffer is already full.
void insertLineBreak() override {
    if (this->is_full) {
        return;
    }
    this->is_full = text_buffer_append_char(tex, ' ') == TEXT_BUF_FULL;
}
// The callbacks below are required by WPXDocumentInterface but are intentionally left
// empty: this handler only collects text and ignores these events.
void definePageStyle(const WPXPropertyList &propList) override { /* noop */ }
void closePageSpan() override { /* noop */ }
void openHeader(const WPXPropertyList &propList) override { /* noop */ }
void closeHeader() override { /* noop */ }
void openFooter(const WPXPropertyList &propList) override { /* noop */ }
void closeFooter() override { /* noop */ }
void
defineParagraphStyle(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void openParagraph(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void defineCharacterStyle(const WPXPropertyList &propList) override { /* noop */ }
void openSpan(const WPXPropertyList &propList) override { /* noop */ }
void
defineSectionStyle(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openSection(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void insertField(const WPXString &type, const WPXPropertyList &propList) override { /* noop */ }
void defineOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void defineUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openOrderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void openUnorderedListLevel(const WPXPropertyList &propList) override { /* noop */ }
void closeOrderedListLevel() override { /* noop */ }
void closeUnorderedListLevel() override { /* noop */ }
void openListElement(const WPXPropertyList &propList, const WPXPropertyListVector &tabStops) override { /* noop */ }
void closeListElement() override { /* noop */ }
void openFootnote(const WPXPropertyList &propList) override { /* noop */ }
void closeFootnote() override { /* noop */ }
void openEndnote(const WPXPropertyList &propList) override { /* noop */ }
void closeEndnote() override { /* noop */ }
void openComment(const WPXPropertyList &propList) override { /* noop */ }
void closeComment() override { /* noop */ }
void openTextBox(const WPXPropertyList &propList) override { /* noop */ }
void closeTextBox() override { /* noop */ }
void openTable(const WPXPropertyList &propList, const WPXPropertyListVector &columns) override { /* noop */ }
void openTableRow(const WPXPropertyList &propList) override { /* noop */ }
void closeTableRow() override { /* noop */ }
void openTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTableCell() override { /* noop */ }
void insertCoveredTableCell(const WPXPropertyList &propList) override { /* noop */ }
void closeTable() override { /* noop */ }
void openFrame(const WPXPropertyList &propList) override { /* noop */ }
void closeFrame() override { /* noop */ }
void insertBinaryObject(const WPXPropertyList &propList, const WPXBinaryData &data) override { /* noop */ }
void insertEquation(const WPXPropertyList &propList, const WPXString &data) override { /* noop */ }
void openPageSpan(const WPXPropertyList &propList) override { /* noop */ }
void startDocument() override { /* noop */ };
};
/**
 * Create a libwpd in-memory stream over `buf_len` bytes starting at `buf`.
 *
 * The returned handle is heap-allocated; release it with
 * wpd_memory_stream_destroy().
 */
wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len) {
    return new WPXStringStream(buf, buf_len);
}
/**
 * Ask libwpd how confident it is that the stream holds a supported document.
 *
 * The libwpd confidence value is cast directly to the C enum, so the two
 * enumerations must keep the same numeric values.
 */
wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t ptr) {
    return (wpd_confidence_t) WPDocument::isFileFormatSupported((WPXStringStream *) ptr);
}
/**
 * Parse the document behind `ptr`, collecting its text into `tex`.
 *
 * The libwpd result is cast directly to the C enum, so the two enumerations
 * must keep the same numeric values.
 */
wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc) {
    StringDocument sink(tex, doc);
    // NOTE(review): the last argument is presumably the document password -- confirm.
    WPDResult status = WPDocument::parse((WPXStringStream *) ptr, &sink, nullptr);
    return (wpd_result_t) status;
}
/** Delete a stream previously returned by wpd_memory_stream_create(). */
void wpd_memory_stream_destroy(wpd_stream_t ptr) {
    delete (WPXStringStream *) ptr;
}

View File

@ -1,50 +0,0 @@
#ifndef SIST2_LIBWPD_C_API_H
#define SIST2_LIBWPD_C_API_H
#include "stdlib.h"
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
#ifdef __cplusplus
extern "C" {
#endif
#include "../scan.h"
#include "../util.h"
#ifdef __cplusplus
};
#endif
/* Opaque stream handle: obtained from wpd_memory_stream_create() and released
 * with wpd_memory_stream_destroy(). */
typedef void *wpd_stream_t;
/* Format-detection verdict returned by wpd_is_file_format_supported().
 * NOTE(review): presumably mirrors libwpd's own confidence enum value-for-value
 * (the C++ wrapper converts with a plain cast) -- keep the order in sync. */
typedef enum {
C_WPD_CONFIDENCE_NONE = 0,
C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION,
C_WPD_CONFIDENCE_EXCELLENT
} wpd_confidence_t;
/* Outcome of wpd_parse(); C_WPD_OK (0) is the only success value.
 * NOTE(review): presumably mirrors libwpd's own result enum value-for-value
 * (the C++ wrapper converts with a plain cast) -- keep the order in sync. */
typedef enum {
C_WPD_OK,
C_WPD_FILE_ACCESS_ERROR,
C_WPD_PARSE_ERROR,
C_WPD_UNSUPPORTED_ENCRYPTION_ERROR,
C_WPD_PASSWORD_MISSMATCH_ERROR,
C_WPD_OLE_ERROR,
C_WPD_UNKNOWN_ERROR
} wpd_result_t;
/* Reports how confident the library is that `stream` holds a supported document. */
EXTERNC wpd_confidence_t wpd_is_file_format_supported(wpd_stream_t stream);
/* Creates a stream over `buf_len` bytes at `buf`.
 * NOTE(review): whether the stream copies `buf` is not visible here; keep `buf`
 * alive until the stream has been destroyed. */
EXTERNC wpd_stream_t wpd_memory_stream_create(const unsigned char *buf, size_t buf_len);
/* Releases a stream created by wpd_memory_stream_create(). */
EXTERNC void wpd_memory_stream_destroy(wpd_stream_t stream);
/* Parses the document behind `ptr`, writing the extracted text into `tex`. */
EXTERNC wpd_result_t wpd_parse(wpd_stream_t ptr, text_buffer_t *tex, document_t *doc);
#endif

View File

@ -1,41 +0,0 @@
#include "wpd.h"
#include "libwpd_c_api.h"
/**
 * Extract the text of a WordPerfect document and attach it to `doc` as
 * MetaContent.
 *
 * @param ctx parser context (presumably read by the CTX_LOG_* macros)
 * @param f   file to read; its whole content is loaded into memory
 * @param doc document receiving the extracted text
 * @return SCAN_ERR_READ when the file cannot be read, is encrypted or is not
 *         recognised as WPD; SCAN_OK otherwise. A parse error is logged, but
 *         any text gathered before the error is still kept.
 */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc) {
    size_t buf_len;
    void *buf = read_all(f, &buf_len);

    // NOTE(review): assumes read_all() signals failure with NULL -- confirm.
    if (buf == NULL) {
        CTX_LOG_ERRORF("wpd.c", "Could not read file [%s]", doc->filepath)
        return SCAN_ERR_READ;
    }

    scan_code_t ret = SCAN_ERR_READ;
    void *stream = wpd_memory_stream_create(buf, buf_len);
    wpd_confidence_t conf = wpd_is_file_format_supported(stream);

    if (conf == C_WPD_CONFIDENCE_SUPPORTED_ENCRYPTION || conf == C_WPD_CONFIDENCE_UNSUPPORTED_ENCRYPTION) {
        CTX_LOG_DEBUGF("wpd.c", "File is encrypted! Password-protected WPD files are not supported yet (conf=%d)", conf)
        goto cleanup;
    }

    if (conf != C_WPD_CONFIDENCE_EXCELLENT) {
        CTX_LOG_ERRORF("wpd.c", "Unsupported file format! [%s] (conf=%d)", doc->filepath, conf)
        goto cleanup;
    }

    {
        text_buffer_t tex = text_buffer_create(-1);
        wpd_result_t res = wpd_parse(stream, &tex, doc);

        if (res != C_WPD_OK) {
            CTX_LOG_ERRORF("wpd.c", "Error while parsing WPD file [%s] (%d)",
                           doc->filepath, res)
        }

        // Only attach content when something was actually extracted.
        if (tex.dyn_buffer.cur != 0) {
            APPEND_STR_META(doc, MetaContent, tex.dyn_buffer.buf)
        }

        text_buffer_destroy(&tex);
    }

    // The original fell off the end of this non-void function here.
    ret = SCAN_OK;

cleanup:
    // Destroy the stream before freeing the bytes it was created over.
    wpd_memory_stream_destroy(stream);
    free(buf);
    return ret;
}

View File

@ -1,23 +0,0 @@
#ifndef SIST2_WPD_H
#define SIST2_WPD_H
#include "../scan.h"
#include "../util.h"
/* Settings and callbacks handed to the WPD parser. */
typedef struct {
/* NOTE(review): presumably the maximum amount of extracted text to keep -- confirm. */
long content_size;
/* Logging callbacks (plain and printf-style). */
log_callback_t log;
logf_callback_t logf;
/* MIME id that is_wpd() compares against. */
unsigned int wpd_mime;
} scan_wpd_ctx_t;
/* Extracts the text of the WPD file `f` and appends it to `doc` as MetaContent.
 * Returns SCAN_ERR_READ when the file is encrypted or not recognised as WPD. */
scan_code_t parse_wpd(scan_wpd_ctx_t *ctx, vfile_t *f, document_t *doc);
/** Returns non-zero when `mime` equals the WPD MIME id configured in `ctx`. */
__always_inline
static int is_wpd(scan_wpd_ctx_t *ctx, unsigned int mime) {
    unsigned int expected = ctx->wpd_mime;
    return expected == mime;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,114 +0,0 @@
#include "test_util.h"
#include <gtest/gtest.h>
#include <unistd.h>
#include <fcntl.h>
// Failure message reported by load_file() when a test fixture cannot be opened.
#define FILE_NOT_FOUND_ERR "Could not find file, did you clone the test files repo?"
/**
 * vfile read callback backed by a file descriptor.
 *
 * Opens `f->filepath` read-only on first use (when `f->fd` is -1).
 *
 * @return the result of read() truncated to int, or -1 if the file could not
 *         be opened
 */
int fs_read(struct vfile *f, void *buf, size_t size) {
    const bool needs_open = f->fd == -1;

    if (needs_open) {
        f->fd = open(f->filepath, O_RDONLY);
    }
    if (f->fd == -1) {
        return -1;
    }

    return (int) read(f->fd, buf, size);
}
/**
 * vfile read callback backed by an in-memory buffer.
 *
 * Copies `size` bytes from the current position and advances the position.
 * There is no out-of-bounds check: the caller must not read past the end of
 * the buffer.
 *
 * @return always 0 (NOTE(review): unlike fs_read(), this is not a byte count)
 */
int mem_read(vfile_t *f, void *buf, size_t size) {
    char *cursor = (char *) f->_test_data;

    memcpy(buf, cursor, size);
    f->_test_data = cursor + size;

    return 0;
}
/** vfile close callback: closes the descriptor if one is open (fd != -1). */
void fs_close(vfile_t *f) {
    if (f->fd == -1) {
        return;
    }
    close(f->fd);
}
/** Reset the metadata list of `doc`, then set up `f` to read from `filepath`. */
void load_doc_file(const char *filepath, vfile_t *f, document_t *doc) {
    doc->meta_tail = nullptr;
    doc->meta_head = nullptr;

    load_file(filepath, f);
}
/** Reset the metadata list of `doc`, then set up `f` to read from the buffer `mem`. */
void load_doc_mem(void *mem, size_t mem_len, vfile_t *f, document_t *doc) {
    doc->meta_tail = nullptr;
    doc->meta_head = nullptr;

    load_mem(mem, mem_len, f);
}
/** Free the metadata of `doc`, then invoke the close callback of `f` if it has one. */
void cleanup(document_t *doc, vfile_t *f) {
    destroy_doc(doc);

    if (f->close != NULL) {
        f->close(f);
    }
}
/**
 * Set up `f` as a filesystem-backed vfile for `filepath`.
 *
 * Raises a fatal test failure and leaves the callbacks unset when the file
 * cannot be opened.
 */
void load_file(const char *filepath, vfile_t *f) {
    stat(filepath, &f->info);

    const int fd = open(filepath, O_RDONLY);
    f->fd = fd;
    if (fd == -1) {
        FAIL() << FILE_NOT_FOUND_ERR;
    }

    f->filepath = filepath;
    f->is_fs_file = TRUE;

    // I/O callbacks
    f->read = fs_read;
    f->close = fs_close;

    // Checksum requested, not yet computed
    f->calculate_checksum = TRUE;
    f->has_checksum = FALSE;
}
/**
 * Set up `f` as a vfile that reads from the buffer `mem` of `size` bytes.
 *
 * No close callback is installed. The file is still flagged as a filesystem
 * file, as in the original helper.
 */
void load_mem(void *mem, size_t size, vfile_t *f) {
    f->filepath = "_mem_";
    f->is_fs_file = TRUE;

    // Backing buffer and its reported size
    f->_test_data = mem;
    f->info.st_size = (int) size;

    // I/O callbacks
    f->read = mem_read;
    f->close = nullptr;
}
/** Find the first metadata entry of `doc` with the given key, or nullptr. */
meta_line_t *get_meta(document_t *doc, metakey key) {
    meta_line_t *head = doc->meta_head;
    return get_meta_from(head, key);
}
/**
 * Walk the metadata list starting at `meta` and return the first entry whose
 * key equals `key`, or nullptr when there is none.
 */
meta_line_t *get_meta_from(meta_line_t *meta, metakey key) {
    for (meta_line_t *cur = meta; cur != nullptr; cur = cur->next) {
        if (cur->key == key) {
            return cur;
        }
    }
    return nullptr;
}
/**
 * Free every node of the metadata list of `doc`.
 *
 * The head and tail pointers of `doc` are left untouched, so they dangle
 * after this call.
 */
void destroy_doc(document_t *doc) {
    meta_line_t *next = nullptr;

    for (meta_line_t *cur = doc->meta_head; cur != nullptr; cur = next) {
        // Read the link before the node is released.
        next = cur->next;
        free(cur);
    }
}
/**
 * Randomly corrupt a buffer in place, for fuzz tests.
 *
 * Runs up to `n` rounds. Each round picks a random offset and then either
 * truncates the logical length or overwrites `width` bytes at that offset
 * with random values.
 *
 * @param buf     buffer to corrupt
 * @param buf_len in/out logical length of `buf`; may shrink, never grows
 * @param width   number of consecutive bytes overwritten per round
 * @param n       number of rounds
 * @param trunc_p probability (in percent) that a round truncates instead of
 *                overwriting
 *
 * Truncation shrinks the length to max(offset, 1000), and only if that is
 * shorter than the current length. Fuzzing stops early once the buffer is too
 * short (fewer than width + 2 bytes) to pick a valid offset.
 */
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p) {
    // Smallest logical length a truncation may shrink the buffer to.
    const size_t min_trunc_len = 1000;

    for (int i = 0; i < n; i++) {
        // With fewer than width + 2 bytes the modulus below would be zero
        // (division by zero) or wrap around (offset outside the buffer).
        if (width >= 0 && *buf_len <= (size_t) width + 1) {
            break;
        }

        size_t offset = rand() % (*buf_len - width - 1);

        if (rand() % 100 < trunc_p) {
            size_t new_len = offset > min_trunc_len ? offset : min_trunc_len;
            // Never grow the logical length past the real allocation.
            if (new_len < *buf_len) {
                *buf_len = new_len;
            }
            continue;
        }

        for (int disp = 0; disp < width; disp++) {
            buf[offset + disp] = (int8_t) rand();
        }
    }
}

View File

@ -1,46 +0,0 @@
#ifndef SCAN_TEST_UTIL_H
#define SCAN_TEST_UTIL_H
#include "../libscan/scan.h"
#include <fcntl.h>
#include <unistd.h>
/* Sets up `f` to read the file at `filepath`. */
void load_file(const char *filepath, vfile_t *f);
/* Sets up `f` to read from the in-memory buffer `mem` of `size` bytes. */
void load_mem(void *mem, size_t size, vfile_t *f);
/* Like load_mem(), and also resets the metadata list of `doc`. */
void load_doc_mem(void *mem, size_t mem_len, vfile_t *f, document_t *doc);
/* Like load_file(), and also resets the metadata list of `doc`. */
void load_doc_file(const char *filepath, vfile_t *f, document_t *doc);
/* Frees the metadata of `doc` and closes `f` if it has a close callback. */
void cleanup(document_t *doc, vfile_t *f);
/* printf-style log callback that discards the message. */
static void noop_logf(const char *filepath, int level, char *format, ...) {
    (void) filepath;
    (void) level;
    (void) format;
}
/* Plain-string log callback that discards the message. */
static void noop_log(const char *filepath, int level, char *str) {
    (void) filepath;
    (void) level;
    (void) str;
}
/* Running total of the value sizes passed to counter_store().
 * NOTE(review): being `static` in a header, every translation unit that
 * includes this file gets its own independent counter. */
static size_t store_size = 0;
/* Store callback that only accounts for the size of `value`; nothing is persisted. */
static void counter_store(char* key, size_t key_len, char *value, size_t value_len) {
store_size += value_len;
/* Disabled debugging aid: writes each stored value to "<uuid-of-key>.jpeg". */
// char id[37];
// char tmp[PATH_MAX];
// uuid_unparse(reinterpret_cast<const unsigned char *>(key), id);
// sprintf(tmp, "%s.jpeg", id);
// int fd = open(tmp, O_TRUNC|O_WRONLY|O_CREAT, 0777);
// write(fd, value, value_len);
// close(fd);
}
/* Returns the first metadata entry of `doc` whose key is `key`, or a null pointer. */
meta_line_t *get_meta(document_t *doc, metakey key);
/* Same search, starting from the list node `meta` instead of the head of a document. */
meta_line_t *get_meta_from(meta_line_t *meta, metakey key);
/* Invokes the close callback of the vfile `f` (an lvalue, not a pointer) when one is set.
 * The argument is parenthesised so expressions such as `*ptr` expand correctly.
 * The expansion already ends its statement: existing call sites omit the trailing ';'. */
#define CLOSE_FILE(f) if ((f).close != NULL) {(f).close(&(f));};
/* Frees every entry of the metadata list of `doc`. */
void destroy_doc(document_t *doc);
/* Corrupts `buf` in place for fuzz tests: `n` rounds that either overwrite `width`
 * random bytes or, with probability `trunc_p` percent, change `*buf_len`. */
void fuzz_buffer(char *buf, size_t *buf_len, int width, int n, int trunc_p);
#endif

@ -1 +0,0 @@
Subproject commit 62ae66db99e9dd88dfa31999f516f71bb8bdc8b2

@ -1 +0,0 @@
Subproject commit 146be69f88575d753317d8ef13b16f80e0656fc7