Merge pull request 'Add firefox bloating (przedplamienie FF)' (#34) from #29 into master

Reviewed-on: #34
2022-11-29 12:45:46 +01:00 · 2022-11-29 12:45:46 +01:00 · e83113bbde
commit e83113bbde
parent 9cb9f9de30 a7e1443460
9 changed files with 72 additions and 16 deletions
--- a/@types/src/request.d.ts
+++ b/@types/src/request.d.ts
@ -1,5 +1,7 @@
 /// <reference types="node" />
+/// <reference types="node" />
 import { ChildProcessWithoutNullStreams } from "child_process";
+import { Readable } from "stream";
 export declare type Image = {
    url: string;
    domain: string;
@ -40,7 +42,9 @@ export default class ScreenshotRequest {
        current_action: string;
        preview: string;
    }>;
+    getPreviewURL(): Promise<string>;
    getGoodImages(): Image[];
    setFinished(): void;
    exec(): Promise<void>;
+    getZIP(): Readable;
 }
--- a/Docker/bloater.sh
+++ b/Docker/bloater.sh
@ -0,0 +1,35 @@
+#!/bin/bash
+BLOATING_DOMAINS=$(cat bloating-domains.txt)
+
+bloat_firefox(){
+	if [ "$#" = 0 ]; then
+		echo "Bloating Firefox by bloating defined domain list..."
+		DOMAINS_LIST=$(printf '%s\n' "${BLOATING_DOMAINS[@]}")
+	else
+		echo "Bloating Firefox by requested domain list..."
+		DOMAINS_LIST=`node filter-requested-domains.js "$1"`
+		echo "selected domains"
+		echo $DOMAINS_LIST
+	fi
+
+	if [ -n "$DOMAINS_LIST" ]; then
+		while IFS= read -r DOMAIN; do
+			# these domains return a 404 anyways, no need to waste time on them:
+			if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi
+			load_website "$DOMAIN?hl=pl" "$DOMAIN"
+			sleep 1 # sometimes the consent popup needs a little time
+			open_console
+			grab "$DOMAIN before"
+			(tr '\n' ' ' < click-accept-all.js) | xclip -sel clip
+			keycombo Control_L v
+			sleep 0.3
+			xdotool key Return
+			sleep 1.5
+			grab "$DOMAIN after"
+		done <<< "$DOMAINS_LIST"
+	else
+		echo "No need to bloat"
+	fi
+}
+
+
--- a/Docker/bloating-domains.txt
+++ b/Docker/bloating-domains.txt
@ -0,0 +1,6 @@
+facebook.com
+google.com
+hotjar.com
+maps.google.com
+linkedin.com
+cookielaw.org
--- a/Docker/filter-requested-domains.js
+++ b/Docker/filter-requested-domains.js
@ -0,0 +1,15 @@
+const fs = require("fs");
+const pth = require("path");
+const BLOATING_DOMAINS = (
+    fs.readFileSync(pth.join(__dirname, "bloating-domains.txt")) + ""
+).split("\n");
+const input = process.argv[2];
+const REQUESTED_DOMAINS = input.split('\n');
+
+const array_diff = REQUESTED_DOMAINS.filter(
+    (v) => !BLOATING_DOMAINS.includes(v)
+    );
+
+for (let i in array_diff) {
+    console.log(array_diff[i]);
+}
--- a/Docker/prepare-firefox.sh
+++ b/Docker/prepare-firefox.sh
@ -3,6 +3,7 @@
 source ./ephemeral-x.sh
 source ./annotate_header.sh
 source ./utils.sh
+source ./bloater.sh

 echo "{\"current_action\": \"Uruchamianie serwera X\"}"

@ -11,6 +12,9 @@ start_firefox
 grab start_firefox
 prepare_firefox
 grab prepare_firefox
+bloat_firefox
+grab bloat_firefox
+
 echo "{\"current_action\": \"Oczekiwanie na URL do analizy...\", \"code\": \"ready\"}"
 ./eternal-sleep.sh &
 wait
--- a/Docker/run-analysis.sh
+++ b/Docker/run-analysis.sh
@ -10,10 +10,13 @@ unquote(){
  echo $1 | sed 's/"//g'
 }

+echo $INPUT
+
 URL=$(unquote $(echo $INPUT | jq .url))
 DOMAINS=`node array-to-lines.js "$(echo $INPUT | jq .third_party_domains)"`

 source ./utils.sh
+source ./bloater.sh

 PREVIEW="TRUE" # set to "TRUE" in order to enable automatic screenshots kept in preview.png 

@ -28,21 +31,8 @@ fi

 ORIGIN_DOMAIN=$(sed -e 's/[^/]*\/\/\([^@]*@\)\?\([^:/]*\).*/\2/' <<< "$URL")

-
-while IFS= read -r DOMAIN; do
-  # these domains return a 404 anyways, no need to waste time on them:
-  if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi
-  load_website "$DOMAIN?hl=pl" "$DOMAIN"
-  sleep 1 # sometimes the consent popup needs a little time
-  open_console
-  grab "$DOMAIN before"
-  (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip
-  keycombo Control_L v
-  sleep 0.3
-  xdotool key Return
-  sleep 1.5
-  grab "$DOMAIN after"
-done <<< "$DOMAINS"
+bloat_firefox "$DOMAINS"
+grab bloat_firefox

 click 1270 217 # the "trash" icon, so requests from plamienie don't appear in the screenshots

--- a/package-lock.json
+++ b/package-lock.json
@ -5,6 +5,7 @@
  "requires": true,
  "packages": {
    "": {
+      "name": "screenshot-service",
      "version": "1.0.0",
      "license": "ISC",
      "dependencies": {
--- a/src/index.ts
+++ b/src/index.ts
@ -58,6 +58,7 @@ router.get("/", async (ctx) => {
            type="text"
            name="domains"
            id="domains"
+            style="width: calc(100vw - 30%)"
            value="doubleclick.net,facebook.com"
          />
          <br />