"""Annotate HTTP header rows found by OCR in a screenshot.

The script opens ``/opt/static/<OUTPUT_FILE_RELATIVE>``, runs Tesseract on a
fixed crop of the image (presumably the header list of a browser network
inspector -- the crop coordinates are hard-coded for one screenshot layout),
and for every requested header name it finds, draws an arrow plus a comment
next to that row.  When at least one header was annotated, the original
screenshot is deleted, a half-size copy is saved next to it with the domain
prefixed to the file name, and a JSON summary is printed to stdout.

Command-line arguments, as read from ``sys.argv``:

    OUTPUT_FILE_RELATIVE DOMAIN [NEEDLE COMMENT MIN_VALUE_LENGTH VALUE_MUST_INCLUDE]...

* ``NEEDLE`` -- header name to look for (case-insensitive substring match).
* ``COMMENT`` -- text drawn next to the matching row.
* ``MIN_VALUE_LENGTH`` -- minimum length of the OCR'd header value.
* ``VALUE_MUST_INCLUDE`` -- substring the header value must contain
  (case-insensitive; pass ``""`` to accept any value).

Environment: ``BASE_URL`` is prepended to the URL in the JSON summary.
"""
# Test with:
#   docker image build -t headless-fox Docker && \
#   docker run -v $PWD/static:/opt/static -ti headless-fox time python annotate_header.py \
#     screenshot.png example.com \
#     "content-type" "Here is the content-type" 0 "" \
#     "etag" "Here is the etag with a long description that has many words and extends beyond the network inspector" 0 ""
# NOTE(review): the argument list above was rewritten to follow the layout
# that main() parses; the earlier example did not match it.  Verify the image
# and script names against the actual Docker setup.

import json
import os
import sys
import uuid

import pytesseract
from PIL import Image, ImageDraw, ImageFont
from pytesseract import Output

STATIC_DIR = "/opt/static/"
FONT_PATH = "/usr/share/fonts/noto/NotoSansDisplay-Medium.ttf"
FONT_SIZE = 48

# Region of the screenshot handed to OCR: (left, top, right, bottom).
CROP_X_OFFSET = 2054
CROP_Y_OFFSET = 313
CROP_RIGHT = 2875
CROP_BOTTOM = 1558

# Within the crop, a word whose left edge is this close to HEADER_NAME_X is
# treated as a header name; everything else is a candidate header value.
HEADER_NAME_X = 59
HEADER_NAME_X_TOLERANCE = 2
SAME_COLUMN_TOLERANCE = 4
SAME_ROW_TOLERANCE = 4

# Each needle on the command line is described by this many arguments.
NEEDLE_GROUP_SIZE = 4

MARKER_COLOR = "#ff726b"
MARKER_RADIUS = 30
MARKER_LINE_LENGTH = 200
TEXT_PADDING = 10


def partition(lst, size):
    """Yield consecutive slices of *lst* holding at most *size* items each."""
    for start in range(0, len(lst), size):
        yield lst[start : start + size]


def _run_ocr(image):
    """Return pytesseract's word-level data dict for *image*.

    The image is written to a uniquely named temporary PNG in STATIC_DIR,
    which is removed again even when OCR fails.
    """
    cropped_filename = STATIC_DIR + uuid.uuid4().hex + ".png"
    image.save(cropped_filename)
    try:
        return pytesseract.image_to_data(cropped_filename, output_type=Output.DICT)
    finally:
        os.remove(cropped_filename)


def _header_value(ocr, name_x, name_y):
    """Join, left to right, the OCR words that share a row with a header name.

    Words in the same column as the header name (within
    SAME_COLUMN_TOLERANCE of *name_x*) are skipped; of the rest, those whose
    top edge is within SAME_ROW_TOLERANCE of *name_y* are concatenated
    without separators.
    """
    chunks = []
    for word, word_x, word_y in zip(ocr["text"], ocr["left"], ocr["top"]):
        if abs(word_x - name_x) <= SAME_COLUMN_TOLERANCE:
            # Same column as the header name, so not part of the value.
            continue
        if abs(word_y - name_y) <= SAME_ROW_TOLERANCE:
            chunks.append((word, word_x))
    chunks.sort(key=lambda chunk: chunk[1])
    return "".join(word for word, _ in chunks)


def _draw_annotation(draw, font, print_x, print_y, comment):
    """Draw a triangle, a connecting line and *comment* to the left of a row.

    *print_x* / *print_y* are full-image coordinates of the left edge and
    vertical midline of the header name being pointed at.
    """
    tip_x = print_x - MARKER_RADIUS - 5
    line_end_x = print_x - MARKER_LINE_LENGTH
    draw.regular_polygon(
        ((tip_x, print_y), MARKER_RADIUS),
        n_sides=3,
        rotation=270,
        fill=MARKER_COLOR,
    )
    draw.line((tip_x, print_y, line_end_x, print_y), fill=MARKER_COLOR, width=10)

    # ImageDraw.textsize() was removed in Pillow 10; textbbox() replaces it.
    # NOTE(review): the right/bottom edges are used as width/height so the
    # measurement stays anchored at the origin like textsize() was -- confirm
    # the white background still covers the text as intended.
    _, _, text_w, text_h = draw.textbbox((0, 0), comment, font=font)

    # White background so the comment stays readable over the screenshot.
    draw.rectangle(
        [
            (line_end_x - text_w - TEXT_PADDING, print_y - text_h / 2),
            (line_end_x + TEXT_PADDING, print_y + text_h / 2),
        ],
        fill="white",
    )
    draw.text(
        (line_end_x - 10, print_y),
        comment,
        fill=MARKER_COLOR,
        anchor="rm",
        font=font,
    )


def main():
    """Parse ``sys.argv``, annotate the screenshot and print a JSON summary.

    Exits with status 0 without touching any file when no requested header
    was found, and with an error message when the arguments are malformed.
    """
    if len(sys.argv) < 3:
        sys.exit(
            "usage: OUTPUT_FILE_RELATIVE DOMAIN "
            "[NEEDLE COMMENT MIN_VALUE_LENGTH VALUE_MUST_INCLUDE]..."
        )

    output_file_relative = sys.argv[1]
    output_file = STATIC_DIR + output_file_relative
    output_dir = os.path.dirname(output_file)
    output_suffix = os.path.basename(output_file)
    domain = sys.argv[2]
    needles = sys.argv[3:]
    # Default to "" so an unset BASE_URL yields a root-relative URL instead of
    # a TypeError after the source screenshot has already been deleted.
    base_url = os.getenv("BASE_URL", "")

    if len(needles) % NEEDLE_GROUP_SIZE != 0:
        sys.exit(
            "each needle needs %d arguments: "
            "NEEDLE COMMENT MIN_VALUE_LENGTH VALUE_MUST_INCLUDE" % NEEDLE_GROUP_SIZE
        )
    needle_specs = list(partition(needles, NEEDLE_GROUP_SIZE))

    with Image.open(output_file) as im:
        cropped = im.crop((CROP_X_OFFSET, CROP_Y_OFFSET, CROP_RIGHT, CROP_BOTTOM))
        ocr = _run_ocr(cropped)

        draw = ImageDraw.Draw(im)
        font = ImageFont.truetype(FONT_PATH, FONT_SIZE)
        found_needles = {}

        boxes = zip(ocr["left"], ocr["top"], ocr["height"], ocr["text"])
        for x, y, h, text in boxes:
            if abs(x - HEADER_NAME_X) > HEADER_NAME_X_TOLERANCE:
                # Not in the header-name column.
                continue
            lowered_text = text.lower()
            for needle, comment, min_value_length, value_must_include in needle_specs:
                if needle.lower() not in lowered_text:
                    continue
                header_value = _header_value(ocr, x, y)
                if len(header_value) < int(min_value_length):
                    continue
                if value_must_include.lower() not in header_value.lower():
                    continue
                found_needles[needle] = header_value

                # Point at the midline of the text rather than its top edge.
                # This is computed into a fresh variable: shifting y itself
                # would skew row matching and marker placement for any
                # further needle that matches this same box.
                mid_y = y + h / 2
                _draw_annotation(
                    draw,
                    font,
                    x + CROP_X_OFFSET,
                    mid_y + CROP_Y_OFFSET,
                    comment,
                )

        if not found_needles:
            sys.exit(0)

        os.remove(output_file)
        resized = im.resize((im.width // 2, im.height // 2))
        annotated_name = domain.replace(".", "_") + "_" + output_suffix
        resized.save(os.path.join(output_dir, annotated_name), "PNG")

    # NOTE(review): the URL below still refers to output_file_relative, i.e.
    # the file removed above, while the annotated image was saved under the
    # domain-prefixed name -- confirm this is what the consumer expects.
    print(
        json.dumps(
            {
                "new_file": {
                    "url": base_url + "/static/" + output_file_relative,
                    "domain": domain,
                    "found_headers": found_needles,
                }
            }
        )
    )


if __name__ == "__main__":
    main()