screenshot-service/Docker/annotate_header.py

114 lines
4.3 KiB
Python
Raw Normal View History

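# test with (current argv layout: <screenshot path relative to /opt/static> <domain>,
# then groups of four: needle, comment, min_value_length, value_must_include;
# the sample command below still passes needle/comment pairs):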
2022-06-15 20:27:15 +02:00
# docker image build -t headless-fox Docker && docker run -v $PWD/static:/opt/static -ti headless-fox time python annotate_header.py screenshot.png "/opt/static/output.png" "content-type" "Tutaj jest content-type" "etag" "Tutaj jest etag z długim opisem co ma wiele słów i wychodzi poza network inspector"
import os
import sys
import pytesseract
import uuid
2022-06-17 11:44:05 +02:00
import json
from pytesseract import Output
2022-06-15 20:27:15 +02:00
from PIL import Image, ImageDraw, ImageFont
2022-06-15 20:27:15 +02:00
# Command-line layout:
#   argv[1]  screenshot path relative to /opt/static; this file is both the
#            source image and the destination (it is annotated in place)
#   argv[2]  domain, echoed back in the JSON result
#   argv[3:] flat list of needle parameters, consumed in groups later on
cli_args = sys.argv
output_file_relative = cli_args[1]
output_file = f"/opt/static/{output_file_relative}"
domain = cli_args[2]
needles = cli_args[3:]

# Prefix for the public URL reported at the end; None when BASE_URL is unset.
base_url = os.environ.get("BASE_URL")
# generator
def partition(lst, size):
    """Lazily yield consecutive slices of *lst*, each at most *size* long.

    The final slice is shorter when ``len(lst)`` is not a multiple of *size*.
    """
    yield from (lst[start:start + size] for start in range(0, len(lst), size))
# print(d)
2022-06-15 20:27:15 +02:00
# Annotate the screenshot in place: OCR a fixed region of it, look for the
# requested header names, and draw a labelled red pointer next to each header
# whose value passes the needle's length/content checks.
with Image.open(output_file) as im:
    # Top-left corner of the OCR region inside the full screenshot. OCR
    # coordinates are relative to the crop, so these are added back when
    # drawing on the uncropped image.
    x_offset = 2054
    y_offset = 313
    cropped = im.crop((x_offset, y_offset, 2875, 1558))

    # Tesseract is given a file path, so the crop goes through a uniquely
    # named temporary file. The finally clause removes it even when saving
    # or OCR raises, so failed runs do not litter /opt/static.
    cropped_filename = "/opt/static/" + uuid.uuid4().hex + ".png"
    try:
        cropped.save(cropped_filename)
        d = pytesseract.image_to_data(cropped_filename, output_type=Output.DICT)
    finally:
        if os.path.exists(cropped_filename):
            os.remove(cropped_filename)

    draw = ImageDraw.Draw(im)
    font = ImageFont.truetype("/usr/share/fonts/noto/NotoSansDisplay-Medium.ttf", 48)

    # The OCR result is a dict of parallel lists; zip them into one tuple per
    # detected box: (left, top, width, height, text).
    boxes = list(zip(d["left"], d["top"], d["width"], d["height"], d["text"]))

    # Each needle is described by four consecutive command-line arguments.
    needle_specs = list(partition(needles, 4))
    found_needles = {}

    # Drawing constants (pixels, in full-screenshot coordinates).
    radius = 30
    fill = "red"
    line_length = 200
    text_padding = 10

    for x, y, _w, h, text in boxes:
        if abs(x - 59) > 2:
            # it means that this is not a header name
            continue
        for needle, comment, min_value_length, value_must_include in needle_specs:
            if needle.lower() not in text.lower():
                continue

            # Collect the text of every other box on the same row (top within
            # 4px) that does not start in the header-name column (left within
            # 4px of this box), then read the chunks left to right.
            row_chunks = [
                (other_text, other_x)
                for other_x, other_y, _other_w, _other_h, other_text in boxes
                if abs(other_x - x) > 4 and abs(other_y - y) <= 4
            ]
            row_chunks.sort(key=lambda chunk: chunk[1])
            header_value = "".join(chunk_text for chunk_text, _ in row_chunks)

            if len(header_value) < int(min_value_length):
                continue
            if value_must_include.lower() not in header_value.lower():
                continue
            found_needles[needle] = header_value

            # Align with the midline of the text rather than its top. This is
            # kept in a separate variable: overwriting `y` here would shift
            # the row matching for any later needle tested against this box.
            mid_y = y + h / 2
            # offset both y and x
            print_y = mid_y + y_offset
            print_x = x + x_offset

            # Arrow head next to the header name, and a horizontal line
            # leading from it to the comment on the left.
            draw.regular_polygon(
                ((print_x - radius - 5, print_y), radius),
                n_sides=3,
                rotation=270,
                fill=fill,
            )
            draw.line(
                (print_x - radius - 5, print_y, print_x - line_length, print_y),
                fill=fill,
                width=10,
            )

            # ImageDraw.textsize() was removed in Pillow 10. The right/bottom
            # edges of the bounding box measured from the origin are used as
            # the width/height in its place.
            _, _, text_w, text_h = draw.textbbox((0, 0), comment, font=font)

            # White background behind the comment so it stays readable.
            draw.rectangle(
                [
                    (print_x - line_length - text_w - text_padding, print_y - text_h / 2),
                    (print_x - line_length + text_padding, print_y + text_h / 2),
                ],
                fill="white",
            )
            draw.text(
                (print_x - line_length - 10, print_y),
                comment,
                fill=fill,
                anchor="rm",
                font=font,
            )

    # Halve both dimensions and overwrite the source screenshot.
    resized = im.resize((im.width // 2, im.height // 2))
    resized.save(output_file, "PNG")
2022-06-17 11:44:05 +02:00
# Report the outcome as a single JSON object on stdout: the public URL of the
# annotated file, the domain passed on the command line, and the header
# values that were matched.
result = {
    "new_file": {
        "url": base_url + "/static/" + output_file_relative,
        "domain": domain,
        "found_headers": found_needles,
    }
}
print(json.dumps(result))