screenshot-service/Docker/annotate_header.py

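# test with (arguments: screenshot path relative to /opt/static, domain, then one
# group of four per header: header name, annotation text, minimum value length,
# substring the value must include; BASE_URL must be set in the environment):

# docker image build -t headless-fox Docker && docker run -v $PWD/static:/opt/static -ti headless-fox time python annotate_header.py screenshot.png "example.com" "content-type" "Tutaj jest content-type" 0 "" "etag" "Tutaj jest etag z długim opisem co ma wiele słów i wychodzi poza network inspector" 0 ""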

import os
import sys
import pytesseract
import uuid
import json
from pytesseract import Output
from PIL import Image, ImageDraw, ImageFont

# Command-line contract:
#   argv[1]  - screenshot path relative to /opt/static/; it is both the source
#              image and the destination (the file is overwritten in place)
#   argv[2]  - domain, passed through to the JSON printed at the end
#   argv[3:] - flat list of header specs, consumed later in groups of four
output_file_relative = sys.argv[1]
domain = sys.argv[2]
needles = sys.argv[3:]

# Absolute location of the screenshot inside the container.
output_file = f"/opt/static/{output_file_relative}"

# Public URL prefix used to build the link reported in the JSON output;
# None when the BASE_URL environment variable is not set.
base_url = os.environ.get("BASE_URL")

# generator
def partition(lst, size):
    """Lazily yield consecutive slices of ``lst``, each at most ``size`` long.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``size``. Works on anything that supports ``len()`` and slicing.
    """
    total = len(lst)
    yield from (lst[start : start + size] for start in range(0, total, size))


# print(d)

# Annotate the screenshot in place: OCR a fixed region of it, look for the
# requested header names, draw a red arrow and a comment label next to every
# match, then save a half-size copy over the original file and print a JSON
# summary to stdout.
with Image.open(output_file) as im:
    # Top-left corner of the OCR region inside the full screenshot. The crop
    # rectangle is hard-coded for one specific screenshot layout (presumably
    # the network inspector's headers panel -- confirm against the capture code).
    x_offset = 2054
    y_offset = 313
    cropped = im.crop((x_offset, y_offset, 2875, 1558))

    # The crop is handed to tesseract through a uniquely named temporary file.
    # try/finally guarantees the file is removed even when saving or OCR fails.
    cropped_filename = "/opt/static/" + uuid.uuid4().hex + ".png"
    try:
        cropped.save(cropped_filename)
        d = pytesseract.image_to_data(cropped_filename, output_type=Output.DICT)
    finally:
        if os.path.exists(cropped_filename):
            os.remove(cropped_filename)

    draw = ImageDraw.Draw(im)
    font = ImageFont.truetype("/usr/share/fonts/noto/NotoSansDisplay-Medium.ttf", 48)

    # One (left, top, height, text) tuple per OCR box, in crop coordinates.
    boxes = list(zip(d["left"], d["top"], d["height"], d["text"]))
    # The needles arrive as a flat argv list; regroup them once, not per box.
    # Each group is (header name, annotation text, min value length,
    # substring the value must include).
    needle_specs = list(partition(needles, 4))

    # Drawing parameters shared by every annotation.
    radius = 30
    fill = "red"
    line_length = 200
    text_padding = 10

    # Maps each matched needle to the OCR'd header value found on its row.
    found_needles = {}
    for x, y, h, text in boxes:
        if abs(x - 59) > 2:
            # it means that this is not a header name
            continue
        for needle, comment, min_value_length, value_must_include in needle_specs:
            if needle.lower() not in text.lower():
                continue

            # The header value is every box on the same row (top within 4px)
            # that does not start in the header-name column (left within 4px).
            row_chunks = [
                (other_text, other_x)
                for other_x, other_y, _other_h, other_text in boxes
                if abs(other_x - x) > 4 and abs(other_y - y) <= 4
            ]
            # Left-to-right reading order; chunks are joined without separators.
            row_chunks.sort(key=lambda chunk: chunk[1])
            header_value = "".join(chunk_text for chunk_text, _ in row_chunks)

            if len(header_value) < int(min_value_length):
                continue
            if value_must_include.lower() not in header_value.lower():
                continue
            found_needles[needle] = header_value

            # Align with the midline of the text rather than its top. This is
            # kept in its own variable so that `y` stays the box's top edge
            # for any further needle that matches the same box.
            mid_y = y + h / 2
            # Translate from crop coordinates to full-screenshot coordinates.
            print_y = mid_y + y_offset
            print_x = x + x_offset

            # Arrow head just left of the header name...
            draw.regular_polygon(
                ((print_x - radius - 5, print_y), radius), n_sides=3, rotation=270, fill=fill
            )
            # ...and its shaft extending further left.
            draw.line(
                (print_x - radius - 5, print_y, print_x - line_length, print_y),
                fill=fill,
                width=10,
            )

            # textbbox replaces ImageDraw.textsize, which was removed in
            # Pillow 10; measured from the origin, the right/bottom edges are
            # the text's width and height.
            _, _, text_w, text_h = draw.textbbox((0, 0), comment, font=font)
            # White backdrop so the comment stays readable over the screenshot.
            draw.rectangle(
                [
                    (print_x - line_length - text_w - text_padding, print_y - text_h / 2),
                    (print_x - line_length + text_padding, print_y + text_h / 2),
                ],
                fill="white",
            )
            # Right-middle anchored so the comment ends where the arrow shaft ends.
            draw.text(
                (print_x - line_length - 10, print_y),
                comment,
                fill=fill,
                anchor="rm",
                font=font,
            )

    # Scale down by half in each dimension so the stored image is smaller.
    resized = im.resize((im.width // 2, im.height // 2))
    resized.save(output_file, "PNG")
    print(
        json.dumps(
            {
                "new_file": {
                    "url": base_url + "/static/" + output_file_relative,
                    "domain": domain,
                    "found_headers": found_needles,
                }
            }
        )
    )
Checkpoint - begin works on faster, more integrated image processing 2022-05-29 22:00:28 +02:00			`# test with:`

Faster image annotation 2022-06-15 20:27:15 +02:00			`# docker image build -t headless-fox Docker && docker run -v $PWD/static:/opt/static -ti headless-fox time python annotate_header.py screenshot.png "/opt/static/output.png" "content-type" "Tutaj jest content-type" "etag" "Tutaj jest etag z długim opisem co ma wiele słów i wychodzi poza network inspector"`
Checkpoint - begin works on faster, more integrated image processing 2022-05-29 22:00:28 +02:00
			`import os`
			`import sys`
			`import pytesseract`
			`import uuid`
Add metadata to screenshots 2022-06-17 11:44:05 +02:00			`import json`
Checkpoint - begin works on faster, more integrated image processing 2022-05-29 22:00:28 +02:00			`from pytesseract import Output`
Faster image annotation 2022-06-15 20:27:15 +02:00			`from PIL import Image, ImageDraw, ImageFont`
Checkpoint - begin works on faster, more integrated image processing 2022-05-29 22:00:28 +02:00
Faster image annotation 2022-06-15 20:27:15 +02:00			`output_file_relative = sys.argv[`
			`1`
			`] # this is also the existing source screenshot to annotate. It will be updated in-place`

			`output_file = "/opt/static/" + output_file_relative`
Add metadata to screenshots 2022-06-17 11:44:05 +02:00			`domain = sys.argv[2]`
			`needles = sys.argv[3:]`
Faster image annotation 2022-06-15 20:27:15 +02:00
			`base_url = os.getenv("BASE_URL")`

			`# generator`
			`def partition(lst, size):`
			`for i in range(0, len(lst), size):`
			`yield lst[i : i + size]`
Checkpoint - begin works on faster, more integrated image processing 2022-05-29 22:00:28 +02:00

			`# print(d)`

Faster image annotation 2022-06-15 20:27:15 +02:00			`with Image.open(output_file) as im:`
			`x_offset = 2054`
			`y_offset = 313`
			`cropped = im.crop((x_offset, y_offset, 2875, 1558))`
			`cropped_filename = "/opt/static/" + uuid.uuid4().hex + ".png"`
			`cropped.save(cropped_filename)`
			`d = pytesseract.image_to_data(cropped_filename, output_type=Output.DICT)`
			`os.remove(cropped_filename)`
			`draw = ImageDraw.Draw(im)`
			`n_boxes = len(d["level"])`
			`font = ImageFont.truetype("/usr/share/fonts/noto/NotoSansDisplay-Medium.ttf", 48)`
Add header values to output 2022-07-08 10:40:16 +02:00			`found_needles = {}`
Faster image annotation 2022-06-15 20:27:15 +02:00			`for i in range(n_boxes):`
			`(x, y, w, h, text) = (`
			`d["left"][i],`
			`d["top"][i],`
			`d["width"][i],`
			`d["height"][i],`
			`d["text"][i],`
			`)`
Add header values to output 2022-07-08 10:40:16 +02:00
			`if abs(x - 59) > 2:`
			`# it means that this is not a header name`
			`continue`
			`for [needle, comment, min_value_length, value_must_include] in partition(needles, 4):`
Faster image annotation 2022-06-15 20:27:15 +02:00			`if needle.lower() in text.lower():`
Add header values to output 2022-07-08 10:40:16 +02:00			`header_value_chunks = []`
			`for i in range(n_boxes):`
			`(other_x, other_y, other_w, other_h, other_text) = (`
			`d["left"][i],`
			`d["top"][i],`
			`d["width"][i],`
			`d["height"][i],`
			`d["text"][i],`
			`)`
			`if abs(other_x - x) <= 4:`
			`# it means that this is a header name`
			`continue`
			`if abs(other_y - y) <= 4:`
			`header_value_chunks.append((other_text, other_x))`
			`header_value_chunks.sort(key=lambda y: y[1])`
			`header_value = ""`
			`for chunk in header_value_chunks:`
			`header_value = header_value + chunk[0]`
			`if len(header_value) < int(min_value_length):`
			`continue`
			`if not (value_must_include.lower() in header_value.lower()):`
			`continue`
			`found_needles[needle] = header_value`
Faster image annotation 2022-06-15 20:27:15 +02:00			`# modify y so it's aligned not with the top of the text, but with the midline`
			`y = y + h / 2`
			`radius = 30`
			`# offset both y and x`
Add header values to output 2022-07-08 10:40:16 +02:00			`print_y = y + y_offset`
			`print_x = x + x_offset`
Faster image annotation 2022-06-15 20:27:15 +02:00			`fill = "red"`
			`line_length = 200`
			`draw.regular_polygon(`
Add header values to output 2022-07-08 10:40:16 +02:00			`((print_x - radius - 5, print_y), radius), n_sides=3, rotation=270, fill=fill`
Faster image annotation 2022-06-15 20:27:15 +02:00			`)`
Add header values to output 2022-07-08 10:40:16 +02:00			`draw.line((print_x - radius - 5, print_y, print_x - line_length, print_y), fill=fill, width=10)`
Faster image annotation 2022-06-15 20:27:15 +02:00			`text_w, text_h = draw.textsize(comment, font)`
			`text_padding = 10`
			`draw.rectangle(`
			`[`
Add header values to output 2022-07-08 10:40:16 +02:00			`(print_x - line_length - text_w - text_padding, print_y - text_h / 2),`
			`(print_x - line_length + text_padding, print_y + text_h / 2),`
Faster image annotation 2022-06-15 20:27:15 +02:00			`],`
			`fill="white",`
			`)`
			`draw.text(`
Add header values to output 2022-07-08 10:40:16 +02:00			`(print_x - line_length - 10, print_y),`
Faster image annotation 2022-06-15 20:27:15 +02:00			`comment,`
			`fill=fill,`
			`anchor="rm",`
			`font=font,`
			`)`
Scale the images down so they aren't as huge 2022-06-19 13:52:43 +02:00			`im = im.resize((im.width // 2, im.height // 2))`
Faster image annotation 2022-06-15 20:27:15 +02:00			`im.save(output_file, "PNG")`
Add metadata to screenshots 2022-06-17 11:44:05 +02:00			`print(json.dumps({"new_file":`
			`{"url": base_url + "/static/" + output_file_relative,`
			`"domain": domain,`
			`"found_headers": found_needles}}))`