2022-05-29 22:00:28 +02:00
# test with:
2022-06-15 20:27:15 +02:00
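# docker image build -t headless-fox Docker && docker run -v $PWD/static:/opt/static -ti headless-fox time python annotate_header.py screenshot.png "/opt/static/output.png" "content-type" "Tutaj jest content-type" "etag" "Tutaj jest etag z długim opisem co ma wiele słów i wychodzi poza network inspector"
# NOTE: the arguments in the command above predate the current interface. The script now reads:
#   <screenshot path relative to /opt/static/> <domain>
# followed by groups of four values:
#   <header name> <annotation text> <min value length> <text the value must include>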
2022-05-29 22:00:28 +02:00
import os
import sys
import pytesseract
import uuid
2022-06-17 11:44:05 +02:00
import json
2022-05-29 22:00:28 +02:00
from pytesseract import Output
2022-06-15 20:27:15 +02:00
from PIL import Image , ImageDraw , ImageFont
2022-05-29 22:00:28 +02:00
2022-06-15 20:27:15 +02:00
# Command-line interface:
#   argv[1]  - screenshot path, relative to /opt/static/
#   argv[2]  - domain the screenshot belongs to
#   argv[3:] - flat list of needle descriptions, consumed in groups of four:
#              <header name> <annotation text> <min value length> <value must include>
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit(
        "usage: " + sys.argv[0] + " <relative screenshot path> <domain>"
        " [<header name> <annotation text> <min value length> <value must include>]..."
    )

output_file_relative = sys.argv[1]
output_file = "/opt/static/" + output_file_relative
output_dir = os.path.dirname(output_file)
output_suffix = os.path.basename(output_file)
domain = sys.argv[2]
needles = sys.argv[3:]
# Prefix for the URL reported at the end of the script. Fall back to an empty
# prefix when BASE_URL is unset so that building the URL yields a
# site-relative path instead of raising TypeError on `None + str`.
base_url = os.getenv("BASE_URL", "")
# generator
def partition(lst, size):
    """Yield consecutive slices of *lst*, each at most *size* items long.

    The last slice is shorter when ``len(lst)`` is not a multiple of *size*.
    An empty *lst* yields nothing.

    Because this is a generator, the argument check below runs when the first
    item is requested, not when ``partition`` is called.

    Raises:
        ValueError: if *size* is not a positive integer. A zero step already
            made ``range`` raise ValueError; a negative step used to yield
            nothing at all, silently dropping every element.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer, got " + repr(size))
    for start in range(0, len(lst), size):
        yield lst[start : start + size]
2022-05-29 22:00:28 +02:00
# print(d)
2022-06-15 20:27:15 +02:00
# OCR a fixed region of the screenshot, look for the requested header names,
# draw an arrow plus a text label next to every header whose value passes the
# checks, then save a half-size copy and print a JSON report on stdout.
# If no header matches, the script exits with status 0 and leaves the
# original screenshot untouched.
with Image.open(output_file) as im:
    # Top-left corner of the OCR'd region, in full-image pixels. OCR
    # coordinates are relative to the crop, so these are added back when drawing.
    # NOTE(review): the crop box presumably matches the headers panel of a
    # fixed-size browser screenshot -- confirm against the capture setup.
    x_offset = 2054
    y_offset = 313
    cropped = im.crop((x_offset, y_offset, 2875, 1558))

    # pytesseract is handed a file path, so the crop is written to a uniquely
    # named temporary file; remove it even when OCR fails.
    cropped_filename = "/opt/static/" + uuid.uuid4().hex + ".png"
    cropped.save(cropped_filename)
    try:
        d = pytesseract.image_to_data(cropped_filename, output_type=Output.DICT)
    finally:
        os.remove(cropped_filename)

    draw = ImageDraw.Draw(im)
    n_boxes = len(d["level"])
    font = ImageFont.truetype("/usr/share/fonts/noto/NotoSansDisplay-Medium.ttf", 48)

    # Drawing parameters shared by every annotation.
    radius = 30
    fill = "#ff726b"
    line_length = 200
    text_padding = 10

    # Maps each matched needle to the header value that was read next to it.
    found_needles = {}

    for i in range(n_boxes):
        x = d["left"][i]
        y = d["top"][i]
        h = d["height"][i]
        text = d["text"][i]
        if abs(x - 59) > 2:
            # it means that this is not a header name
            continue
        for needle, comment, min_value_length, value_must_include in partition(needles, 4):
            if needle.lower() not in text.lower():
                continue

            # The header value is every OCR box on the same text row
            # (top within 4 px) that is not in the header-name column
            # (left more than 4 px away), read left to right.
            header_value_chunks = [
                (d["text"][j], d["left"][j])
                for j in range(n_boxes)
                if abs(d["left"][j] - x) > 4 and abs(d["top"][j] - y) <= 4
            ]
            header_value_chunks.sort(key=lambda chunk: chunk[1])
            header_value = "".join(chunk_text for chunk_text, _ in header_value_chunks)

            if len(header_value) < int(min_value_length):
                continue
            if value_must_include.lower() not in header_value.lower():
                continue
            found_needles[needle] = header_value

            # Align with the midline of the text rather than its top. Kept in
            # its own variable: overwriting `y` would shift the row used by
            # any further needle that matches this same box.
            mid_y = y + h / 2
            # offset both y and x back into full-image coordinates
            print_y = mid_y + y_offset
            print_x = x + x_offset

            # Arrow: a triangle just left of the header name plus a horizontal line.
            draw.regular_polygon(
                ((print_x - radius - 5, print_y), radius),
                n_sides=3,
                rotation=270,
                fill=fill,
            )
            draw.line(
                (print_x - radius - 5, print_y, print_x - line_length, print_y),
                fill=fill,
                width=10,
            )

            # ImageDraw.textsize() was removed in Pillow 10; derive the text
            # size from its bounding box instead.
            left, top, right, bottom = draw.textbbox((0, 0), comment, font=font)
            text_w = right - left
            text_h = bottom - top

            # White background behind the label, then the label itself,
            # anchored by its right-middle point at the end of the line.
            draw.rectangle(
                [
                    (print_x - line_length - text_w - text_padding, print_y - text_h / 2),
                    (print_x - line_length + text_padding, print_y + text_h / 2),
                ],
                fill="white",
            )
            draw.text(
                (print_x - line_length - 10, print_y),
                comment,
                fill=fill,
                anchor="rm",
                font=font,
            )

    if not found_needles:
        # sys.exit rather than the `site`-provided exit(), which is not
        # guaranteed to exist in every interpreter configuration.
        sys.exit(0)

    resized = im.resize((im.width // 2, im.height // 2))
    output_filename = domain.replace(".", "_") + "_" + output_suffix
    # Write the annotated copy first and only then delete the original, so a
    # failed save does not lose the screenshot. The two paths always differ
    # because the new name carries the domain prefix.
    resized.save(output_dir + "/" + output_filename, "PNG")
    os.remove(output_file)

    print(
        json.dumps(
            {
                "new_file": {
                    "url": base_url
                    + "/static/"
                    + os.path.dirname(output_file_relative)
                    + "/"
                    + output_filename,
                    "domain": domain,
                    "found_headers": found_needles,
                    "filename": output_filename,
                }
            }
        )
    )