feat(paperless): change paperless OCR engine model from tesseract_fast to tesseract_best

This commit is contained in:
2026-05-11 19:45:38 +09:00
parent e1936b494d
commit 1096981ef2
4 changed files with 41 additions and 3 deletions
@@ -57,8 +57,16 @@
- "data/containers/paperless/consume"
- "containers/paperless"
- "containers/paperless/ssl"
- "containers/paperless/build"
become: true
# Render the Containerfile template kept in the console node's config tree
# into the paperless build directory under this node's home path.
- name: Deploy containerfile for build
  ansible.builtin.template:
    # Written as "Containerfile" inside the directory that the image build
    # task uses as its build path.
    dest: "{{ node['home_path'] }}/containers/paperless/build/Containerfile"
    src: "{{ hostvars['console']['node']['config_path'] }}/services/containers/app/paperless/build/paperless.containerfile.j2"
    # Quoted so the octal mode is passed as a string: owner rw, group r.
    mode: "0640"
    owner: "{{ ansible_user }}"
    group: "svadmins"
- name: Deploy root certificate
ansible.builtin.copy:
@@ -72,6 +80,18 @@
notify: "notification_restart_paperless"
no_log: true
# Build the customised paperless-ngx image locally from the deployed
# Containerfile and tag it under the internal domain / node name.
# NOTE(review): other parts of this change reference the image with a
# hard-coded "ilnmors.internal" prefix; they only match this name if
# domain['internal'] (and node['name']) resolve to the same values - confirm.
# NOTE(review): no "force" option is set here, so confirm that a changed
# Containerfile with an unchanged tag still triggers a rebuild.
- name: Build paperless container image
  containers.podman.podman_image:
    state: "build"
    name: "{{ domain['internal'] }}/{{ node['name'] }}/paperless-ngx"
    # Same version variable as the upstream base image tag in the
    # Containerfile's FROM line, so both tags stay in step.
    tag: "{{ version['containers']['paperless'] }}"
    # Directory that holds the rendered Containerfile.
    path: "{{ node['home_path'] }}/containers/paperless/build"
# Clean up images via podman's prune after the build step.
# NOTE(review): the module call carries no paperless-specific filter, so it
# presumably affects every prunable image of this user - confirm acceptable.
- name: Prune paperless dangling images
  containers.podman.podman_prune:
    image: true
- name: Register secret value to podman secret
containers.podman.podman_secret:
name: "{{ item.name }}"
@@ -129,8 +149,8 @@
loop:
- image: "docker.io/library/redis:{{ version['containers']['redis'] }}"
file: "docker.io_library_redis_{{ version['containers']['redis'] }}"
- image: "ghcr.io/paperless-ngx/paperless-ngx:{{ version['containers']['paperless'] }}"
file: "ghcr.io_paperless-ngx_paperless-ngx_{{ version['containers']['paperless'] }}"
- image: "ilnmors.internal/{{ node['name'] }}/paperless-ngx:{{ version['containers']['paperless'] }}"
file: "ilnmors.internal_{{ node['name'] }}_paperless-ngx_{{ version['containers']['paperless'] }}"
loop_control:
label: "{{ item.file }}"
register: container_archive_images
@@ -0,0 +1,13 @@
# Custom paperless-ngx image: the upstream image plus the Korean and English
# traineddata files from the tesseract-ocr/tessdata_best repository.
# The base tag is rendered by Ansible from version['containers']['paperless'].
FROM ghcr.io/paperless-ngx/paperless-ngx:{{ version['containers']['paperless'] }}

# Switch to root for the package installation and for writing the model
# files under /usr/share.
USER root

# "set -e" aborts the whole step on the first failing command.
# NOTE(review): the models are fetched from the moving "main" branch, so a
# rebuild is not pinned to a specific model revision.
RUN set -e; \
    apt-get update; \
    apt-get install --yes --no-install-recommends curl ca-certificates; \
    curl --fail --silent --show-error --location \
        --output /usr/share/tesseract-ocr/5/tessdata/kor.traineddata \
        https://raw.githubusercontent.com/tesseract-ocr/tessdata_best/main/kor.traineddata; \
    curl --fail --silent --show-error --location \
        --output /usr/share/tesseract-ocr/5/tessdata/eng.traineddata \
        https://raw.githubusercontent.com/tesseract-ocr/tessdata_best/main/eng.traineddata; \
    rm -rf /var/lib/apt/lists/*

# Leave the image configured to run as the paperless user.
USER paperless
@@ -8,7 +8,7 @@ After=redis_paperless.service
Wants=redis_paperless.service
[Container]
Image=ghcr.io/paperless-ngx/paperless-ngx:{{ version['containers']['paperless'] }}
Image=ilnmors.internal/app/paperless-ngx:{{ version['containers']['paperless'] }}
ContainerName=paperless
HostName=paperless
PublishPort={{ services['paperless']['ports']['http'] }}:8000/tcp
+5
View File
@@ -45,6 +45,11 @@ ALTER DATABASE paperless_db OWNER TO paperless;
- "paperless"
```
### Paperless custom build
- By default, paperless-ngx uses the Tesseract `tessdata_fast` models.
- A custom container image is built that installs the `tessdata_best` models (Korean and English) to improve OCR accuracy.
## Configuration
### Access to paperless