Add dependencies so that pytesseract can run inside Docker
This commit is contained in:
parent
f6f2d713db
commit
c2680ddca5
40
Dockerfile
40
Dockerfile
|
@ -1,7 +1,41 @@
|
||||||
FROM alpine:3.15
|
FROM python:3.11.0a5-alpine3.15
|
||||||
|
|
||||||
RUN apk add firefox xvfb scrot vips-tools xterm xdotool fluxbox xprop imagemagick bash tesseract-ocr
|
# inspired by https://github.com/darktohka/pytesseract-docker/blob/master/Dockerfile
|
||||||
|
|
||||||
RUN apk add terminus-font ttf-inconsolata ttf-dejavu font-noto font-noto ttf-font-awesome font-noto-extra
|
# Browser, X11/virtual-display tooling, screenshot and image utilities, the
# distro Tesseract package, and fonts.
# --no-cache fetches a fresh package index for this command only and stores
# nothing, so no separate `apk update` layer (which can go stale in the build
# cache) is needed. Packages are sorted and de-duplicated.
RUN apk add --no-cache \
        bash \
        firefox \
        fluxbox \
        font-noto \
        font-noto-extra \
        imagemagick \
        scrot \
        terminus-font \
        tesseract-ocr \
        ttf-dejavu \
        ttf-font-awesome \
        ttf-inconsolata \
        vips-tools \
        xdotool \
        xprop \
        xterm \
        xvfb
|
||||||
|
|
||||||
|
|
||||||
|
# Environment for the build steps and the running container.
#   SHELL, CC, CXX   - shell and clang compiler paths
#   LANG             - UTF-8 C locale
#   PYTHONUNBUFFERED - set to 1 for the Python interpreter
#   PIP_*            - pip settings (value of PIP_NO_CACHE_DIR kept as-is;
#                      NOTE(review): confirm "0" has the intended effect for
#                      the pip version shipped in the base image)
#   TESSDATA_PREFIX  - directory the eng.traineddata download is written to
# Uses the key=value form; the space-separated `ENV key value` form is legacy.
ENV SHELL=/bin/sh \
    CC=/usr/bin/clang \
    CXX=/usr/bin/clang++ \
    LANG=C.UTF-8 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=0 \
    TESSDATA_PREFIX=/usr/local/share/tessdata
|
||||||
|
WORKDIR /tmp
|
||||||
|
|
||||||
|
# Shared libraries that stay in the image after the build toolchain is removed.
RUN apk add --no-cache \
        freetype \
        leptonica \
        libgcc \
        libpng \
        libstdc++ \
        openjpeg \
        openssl \
        tiff \
        zlib

# WORKDIR creates the directory, so no separate `mkdir` is required.
WORKDIR /tmp/tesseract

# Build Tesseract from the upstream default branch, fetch the English model,
# and install pytesseract. Everything happens in ONE layer: files deleted in a
# later layer still occupy space in the earlier one, so the development
# packages (.dev-deps), static libraries and the source tree must be removed
# by the same RUN that created them for the cleanup to shrink the image.
# pip runs before `apk del` so the toolchain is still available to it.
RUN apk add --no-cache --virtual .dev-deps \
        autoconf \
        automake \
        clang \
        file \
        freetype-dev \
        g++ \
        git \
        leptonica-dev \
        libpng-dev \
        libtool \
        linux-headers \
        make \
        openjpeg-dev \
        openssl-dev \
        pkgconfig \
        tiff-dev \
        zlib-dev \
    && mkdir -p "$TESSDATA_PREFIX" \
    && wget https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata -P "$TESSDATA_PREFIX" \
    && git clone --depth 1 https://github.com/tesseract-ocr/tesseract.git . \
    && ./autogen.sh \
    && ./configure \
    && make -j"$(nproc)" \
    && make install \
    && pip install -U --no-cache-dir pytesseract \
    && apk del .dev-deps \
    && rm -f /usr/local/lib/*.a \
    && rm -rf /tmp/* /var/cache/apk/*
|
||||||
|
|
||||||
WORKDIR /opt
|
WORKDIR /opt
|
||||||
|
|
15
get-text-position.py
Normal file
15
get-text-position.py
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
import pytesseract
|
||||||
|
from pytesseract import Output
|
||||||
|
# from PIL import Image
|
||||||
|
# Path of the image handed to pytesseract.
img = 'cropped.png'

# Ask pytesseract for its box data as a dict; the code below reads the
# 'level', 'left', 'top', 'width', 'height' and 'text' entries by index.
d = pytesseract.image_to_data(img, output_type=Output.DICT)
# print(d)

n_boxes = len(d['level'])
for i in range(n_boxes):
    x, y = d['left'][i], d['top'][i]
    w, h = d['width'][i], d['height'][i]
    text = d['text'][i]
    # Only boxes with non-empty text are reported: left, top, quoted text.
    if text != "":
        print(x, y, repr(text))
|
||||||
|
|
||||||
|
|
23
script3.sh
23
script3.sh
|
@ -48,6 +48,20 @@ keycombo(){
|
||||||
sleep 0.5
|
sleep 0.5
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# annotate_header HEADER
#
# Take a screenshot with scrot and crop a fixed region out of it with vips.
#   $1 (HEADER) - label embedded in the screenshot file name.
# Sets the (global) variables header, d, filename, cropped_filename, left,
# top, width and height. Files written:
#   <timestamp>__<HEADER>.png                 full screenshot
#   <timestamp>__<HEADER>.png__cropped.png    region left/top/width/height
annotate_header(){
    header=$1
    d=$(date "+%Y-%m-%d__%H_%M_%S")
    # Braces are required here: underscores are valid in variable names, so
    # "$d__$header" would expand the unset variable "d__" and
    # "$filename__cropped" the unset variable "filename__cropped".
    filename="${d}__${header}.png"
    cropped_filename="${filename}__cropped.png"
    # Crop rectangle in pixels.
    left=2056
    top=330
    width=824
    height=1260
    # Quoted so a HEADER containing spaces or glob characters stays one argument.
    scrot "$filename"
    vips extract_area "$filename" "$cropped_filename" $left $top $width $height
}
|
||||||
|
|
||||||
|
|
||||||
rm -rf /root/.mozilla/firefox/bifup8k5.docker/sessionstore-backups
|
rm -rf /root/.mozilla/firefox/bifup8k5.docker/sessionstore-backups
|
||||||
#echo 'user_pref("layout.css.devPixelsPerPx", "1.5");' >> /root/.mozilla/firefox/bifup8k5.docker/prefs.js
|
#echo 'user_pref("layout.css.devPixelsPerPx", "1.5");' >> /root/.mozilla/firefox/bifup8k5.docker/prefs.js
|
||||||
|
|
||||||
|
@ -131,10 +145,11 @@ do
|
||||||
xdotool key Tab
|
xdotool key Tab
|
||||||
sleep 0.05
|
sleep 0.05
|
||||||
xdotool key Down
|
xdotool key Down
|
||||||
sleep 0.1
|
sleep 0.2
|
||||||
scrot
|
annotate_header set-cookie
|
||||||
echo "########## EXTRACTED TEXT: "
|
# scrot
|
||||||
extract_text 2056 330 824 1260 | grep cookie
|
# echo "########## EXTRACTED TEXT: "
|
||||||
|
# extract_text 2056 330 824 1260 | grep cookie
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user