Add firefox bloating

Arkadiusz Wieczorek 2022-07-24 14:39:50 +02:00
parent e11ef09e84
commit ed17c1a6e1
9 changed files with 99 additions and 15 deletions


@@ -1,5 +1,7 @@
 /// <reference types="node" />
+/// <reference types="node" />
 import { ChildProcessWithoutNullStreams } from "child_process";
+import { Readable } from "stream";
 export declare type Image = {
     url: string;
     domain: string;
@@ -40,7 +42,9 @@ export default class ScreenshotRequest {
         current_action: string;
         preview: string;
     }>;
+    getPreviewURL(): Promise<string>;
     getGoodImages(): Image[];
     setFinished(): void;
     exec(): Promise<void>;
+    getZIP(): Readable;
 }


@@ -0,0 +1,15 @@
+facebook.com
+google.com
+hotjar.com
+maps.google.com
+linkedin.com
+cookielaw.org
+googletagmanager.com
+googleapis.com
+www.google.com
+sirdata.com
+xandr.com
+site.adform.com
+adtonos.com/pl/home-pl
+adtraction.com/pl
+www.cookiebot.com

Docker/bloatter.sh Executable file

@@ -0,0 +1,32 @@
+#!/bin/bash
+BLOATING_DOMAINS=$(while IFS= read -r line; do echo "$line"
+done <bloating-domains.txt)
+# $1 → 0 (mode bloating domains), → 1 (mode requested domains)
+bloat_firefox(){
+    if [ "$1" = 0 ]; then
+        echo "Bloating Firefox by bloating defined domain list..."
+        DOMAINS=$(printf '%s\n' "${BLOATING_DOMAINS[@]}")
+    else
+        echo "Bloating Firefox by requested domain list..."
+        DOMAINS=$FILTERED_DOMAINS
+    fi
+    while IFS= read -r DOMAIN; do
+        # these domains return a 404 anyways, no need to waste time on them:
+        if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi
+        load_website "$DOMAIN?hl=pl" "$DOMAIN"
+        sleep 1 # sometimes the consent popup needs a little time
+        open_console
+        grab "$DOMAIN before"
+        (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip
+        keycombo Control_L v
+        sleep 0.3
+        xdotool key Return
+        sleep 1.5
+        grab "$DOMAIN after"
+    done <<< "$DOMAINS"
+}
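
For context, a minimal usage sketch of the new bloat_firefox function (the domain values below are hypothetical; the helpers it calls — is_http_error, load_website, open_console, grab, keycombo — are assumed to come from the sourced utils.sh and related scripts, as the other files in this commit suggest):

source ./utils.sh
source ./bloatter.sh

# Mode 0: pre-bloat Firefox with the predefined list read from bloating-domains.txt,
# as main.sh now does right after prepare_firefox.
bloat_firefox 0

# Mode 1: bloat only with the domains requested for a job; the function reads them
# from FILTERED_DOMAINS, so that variable has to be populated first.
FILTERED_DOMAINS=$(printf '%s\n' "example-tracker.com" "example-cdn.net")   # hypothetical values
bloat_firefox 1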


@@ -0,0 +1,15 @@
+const fs = require("fs");
+const pth = require("path");
+const BLOATING_DOMAINS = (
+    fs.readFileSync(pth.join(__dirname, "bloating-domains.txt")) + ""
+).split("\n");
+const input = process.argv[2];
+const REQUESTED_DOMAINS = JSON.parse(input);
+const array_diff = REQUESTED_DOMAINS.filter(
+    (v) => !BLOATING_DOMAINS.includes(v)
+);
+for (let i in array_diff) {
+    console.log(array_diff[i]);
+}
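
This appears to be the filter-requested-domains.js invoked in the screenshot script below (the file name itself is not shown in this view): it prints, one per line, the requested third-party domains that are not already covered by bloating-domains.txt. A quick sketch of the expected behaviour, assuming the domain list added above:

# facebook.com is on the pre-bloating list, example.net (made up here) is not,
# so only example.net should be printed:
node filter-requested-domains.js '["facebook.com","example.net"]'
# → example.net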


@@ -3,6 +3,7 @@
 source ./ephemeral-x.sh
 source ./annotate_header.sh
 source ./utils.sh
+source ./bloatter.sh
 echo "{\"current_action\": \"Uruchamianie serwera X\"}"
@@ -11,6 +12,9 @@ start_firefox
 grab start_firefox
 prepare_firefox
 grab prepare_firefox
+bloat_firefox 0
+grab bloat_firefox
 echo "{\"current_action\": \"Oczekiwanie na URL do analizy...\", \"code\": \"ready\"}"
 ./eternal-sleep.sh &
 wait


@@ -9,10 +9,14 @@ unquote(){
     echo $1 | sed 's/"//g'
 }
+echo $INPUT
 URL=$(unquote $(echo $INPUT | jq .url))
 DOMAINS=`node array-to-lines.js "$(echo $INPUT | jq .third_party_domains)"`
+FILTERED_DOMAINS=`node filter-requested-domains.js "$(echo $INPUT | jq .third_party_domains)"`
 source ./utils.sh
+source ./bloatter.sh
 PREVIEW="TRUE" # set to "TRUE" in order to enable automatic screenshots kept in preview.png
@@ -28,20 +32,28 @@ fi
 ORIGIN_DOMAIN=$(sed -e 's/[^/]*\/\/\([^@]*@\)\?\([^:/]*\).*/\2/' <<< "$URL")
-while IFS= read -r DOMAIN; do
-    # these domains return a 404 anyways, no need to waste time on them:
-    if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi
-    load_website "$DOMAIN?hl=pl" "$DOMAIN"
-    sleep 1 # sometimes the consent popup needs a little time
-    open_console
-    grab "$DOMAIN before"
-    (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip
-    keycombo Control_L v
-    sleep 0.3
-    xdotool key Return
-    sleep 1.5
-    grab "$DOMAIN after"
-done <<< "$DOMAINS"
+if [ -z "$FILTERED_DOMAINS" ]
+then
+    echo "No need to blot"
+else
+    bloat_firefox 1
+    grab bloat_firefox
+fi
+# while IFS= read -r DOMAIN; do
+#     # these domains return a 404 anyways, no need to waste time on them:
+#     if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi
+#     load_website "$DOMAIN?hl=pl" "$DOMAIN"
+#     sleep 1 # sometimes the consent popup needs a little time
+#     open_console
+#     grab "$DOMAIN before"
+#     (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip
+#     keycombo Control_L v
+#     sleep 0.3
+#     xdotool key Return
+#     sleep 1.5
+#     grab "$DOMAIN after"
+# done <<< "$DOMAINS"
 click 1270 217 # the "trash" icon, so requests from plamienie don't appear in the screenshots
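
To summarize the new flow in this script, a short sketch with made-up input (the INPUT shape is inferred from the jq calls above; example.org and example-ads.net are placeholders):

INPUT='{"url":"https://example.org","third_party_domains":["facebook.com","example-ads.net"]}'
FILTERED_DOMAINS=`node filter-requested-domains.js "$(echo $INPUT | jq .third_party_domains)"`
# facebook.com is already pre-bloated at startup, so only example-ads.net remains:
if [ -z "$FILTERED_DOMAINS" ]; then
    echo "nothing left to bloat"
else
    bloat_firefox 1   # visits example-ads.net and clicks its consent popup via click-accept-all.js
fi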

package-lock.json generated

@@ -5,6 +5,7 @@
   "requires": true,
   "packages": {
     "": {
+      "name": "screenshot-service",
       "version": "1.0.0",
       "license": "ISC",
       "dependencies": {


@@ -58,6 +58,7 @@ router.get("/", async (ctx) => {
             type="text"
             name="domains"
             id="domains"
+            style="width: calc(100vw - 30%)"
             value="doubleclick.net,facebook.com"
           />
           <br />