From ed17c1a6e18e9aa040f24c78e48e451939799cd0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Wieczorek Date: Sun, 24 Jul 2022 14:39:50 +0200 Subject: [PATCH 1/7] Add firefox bloating --- @types/src/request.d.ts | 4 +++ Docker/array-to-lines.js | 2 +- Docker/bloating-domains.txt | 15 +++++++++++ Docker/bloatter.sh | 32 ++++++++++++++++++++++++ Docker/filter-requested-domains.js | 15 +++++++++++ Docker/prepare-firefox.sh | 4 +++ Docker/run-analysis.sh | 40 +++++++++++++++++++----------- package-lock.json | 1 + src/index.ts | 1 + 9 files changed, 99 insertions(+), 15 deletions(-) create mode 100644 Docker/bloating-domains.txt create mode 100755 Docker/bloatter.sh create mode 100644 Docker/filter-requested-domains.js diff --git a/@types/src/request.d.ts b/@types/src/request.d.ts index 024a620..0e63332 100644 --- a/@types/src/request.d.ts +++ b/@types/src/request.d.ts @@ -1,5 +1,7 @@ /// +/// import { ChildProcessWithoutNullStreams } from "child_process"; +import { Readable } from "stream"; export declare type Image = { url: string; domain: string; @@ -40,7 +42,9 @@ export default class ScreenshotRequest { current_action: string; preview: string; }>; + getPreviewURL(): Promise; getGoodImages(): Image[]; setFinished(): void; exec(): Promise; + getZIP(): Readable; } diff --git a/Docker/array-to-lines.js b/Docker/array-to-lines.js index b5d4e51..d9cd1be 100644 --- a/Docker/array-to-lines.js +++ b/Docker/array-to-lines.js @@ -1,5 +1,5 @@ const input = process.argv[2]; const array = JSON.parse(input); for (let i in array) { - console.log(array[i]); + console.log(array[i]); } diff --git a/Docker/bloating-domains.txt b/Docker/bloating-domains.txt new file mode 100644 index 0000000..9803b74 --- /dev/null +++ b/Docker/bloating-domains.txt @@ -0,0 +1,15 @@ +facebook.com +google.com +hotjar.com +maps.google.com +linkedin.com +cookielaw.org +googletagmanager.com +googleapis.com +www.google.com +sirdata.com +xandr.com +site.adform.com +adtonos.com/pl/home-pl +adtraction.com/pl +www.cookiebot.com \ No newline at end of file diff --git a/Docker/bloatter.sh b/Docker/bloatter.sh new file mode 100755 index 0000000..be8b65b --- /dev/null +++ b/Docker/bloatter.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +BLOATING_DOMAINS=$(while IFS= read -r line; do echo "$line" +done !BLOATING_DOMAINS.includes(v) +); + +for (let i in array_diff) { + console.log(array_diff[i]); +} diff --git a/Docker/prepare-firefox.sh b/Docker/prepare-firefox.sh index 7763a30..193288d 100755 --- a/Docker/prepare-firefox.sh +++ b/Docker/prepare-firefox.sh @@ -3,6 +3,7 @@ source ./ephemeral-x.sh source ./annotate_header.sh source ./utils.sh +source ./bloatter.sh echo "{\"current_action\": \"Uruchamianie serwera X\"}" @@ -11,6 +12,9 @@ start_firefox grab start_firefox prepare_firefox grab prepare_firefox +bloat_firefox 0 +grab bloat_firefox + echo "{\"current_action\": \"Oczekiwanie na URL do analizy...\", \"code\": \"ready\"}" ./eternal-sleep.sh & wait diff --git a/Docker/run-analysis.sh b/Docker/run-analysis.sh index 229f580..5f61dac 100755 --- a/Docker/run-analysis.sh +++ b/Docker/run-analysis.sh @@ -9,10 +9,14 @@ unquote(){ echo $1 | sed 's/"//g' } +echo $INPUT + URL=$(unquote $(echo $INPUT | jq .url)) DOMAINS=`node array-to-lines.js "$(echo $INPUT | jq .third_party_domains)"` +FILTERED_DOMAINS=`node filter-requested-domains.js "$(echo $INPUT | jq .third_party_domains)"` source ./utils.sh +source ./bloatter.sh PREVIEW="TRUE" # set to "TRUE" in order to enable automatic screenshots kept in preview.png @@ -28,20 +32,28 @@ fi ORIGIN_DOMAIN=$(sed -e 's/[^/]*\/\/\([^@]*@\)\?\([^:/]*\).*/\2/' <<< "$URL") -while IFS= read -r DOMAIN; do - # these domains return a 404 anyways, no need to waste time on them: - if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi - load_website "$DOMAIN?hl=pl" "$DOMAIN" - sleep 1 # sometimes the consent popup needs a little time - open_console - grab "$DOMAIN before" - (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip - keycombo Control_L v - sleep 0.3 - xdotool key Return - sleep 1.5 - grab "$DOMAIN after" -done <<< "$DOMAINS" +if [ -z "$FILTERED_DOMAINS" ] +then + echo "No need to blot" +else + bloat_firefox 1 + grab bloat_firefox +fi + +# while IFS= read -r DOMAIN; do +# # these domains return a 404 anyways, no need to waste time on them: +# if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi +# load_website "$DOMAIN?hl=pl" "$DOMAIN" +# sleep 1 # sometimes the consent popup needs a little time +# open_console +# grab "$DOMAIN before" +# (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip +# keycombo Control_L v +# sleep 0.3 +# xdotool key Return +# sleep 1.5 +# grab "$DOMAIN after" +# done <<< "$DOMAINS" click 1270 217 # the "trash" icon, so requests from plamienie don't appear in the screenshots diff --git a/package-lock.json b/package-lock.json index 2dcc549..171ac4f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5,6 +5,7 @@ "requires": true, "packages": { "": { + "name": "screenshot-service", "version": "1.0.0", "license": "ISC", "dependencies": { diff --git a/src/index.ts b/src/index.ts index b5237b4..d43eed8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -58,6 +58,7 @@ router.get("/", async (ctx) => { type="text" name="domains" id="domains" + style="width: calc(100vw - 30%)" value="doubleclick.net,facebook.com" />
From 33de898d1d5f7a884429d8abc8daaa4e37e49634 Mon Sep 17 00:00:00 2001 From: Arkadiusz Wieczorek Date: Sun, 24 Jul 2022 14:41:29 +0200 Subject: [PATCH 2/7] Cleanup --- Docker/run-analysis.sh | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/Docker/run-analysis.sh b/Docker/run-analysis.sh index 5f61dac..f237350 100755 --- a/Docker/run-analysis.sh +++ b/Docker/run-analysis.sh @@ -31,7 +31,6 @@ fi ORIGIN_DOMAIN=$(sed -e 's/[^/]*\/\/\([^@]*@\)\?\([^:/]*\).*/\2/' <<< "$URL") - if [ -z "$FILTERED_DOMAINS" ] then echo "No need to blot" @@ -40,21 +39,6 @@ else grab bloat_firefox fi -# while IFS= read -r DOMAIN; do -# # these domains return a 404 anyways, no need to waste time on them: -# if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi -# load_website "$DOMAIN?hl=pl" "$DOMAIN" -# sleep 1 # sometimes the consent popup needs a little time -# open_console -# grab "$DOMAIN before" -# (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip -# keycombo Control_L v -# sleep 0.3 -# xdotool key Return -# sleep 1.5 -# grab "$DOMAIN after" -# done <<< "$DOMAINS" - click 1270 217 # the "trash" icon, so requests from plamienie don't appear in the screenshots load_website "$URL" "$URL" From 40b55dcac493203a288bcc355d6f18b212e03f2d Mon Sep 17 00:00:00 2001 From: Arkadiusz Wieczorek Date: Sun, 31 Jul 2022 12:19:32 +0200 Subject: [PATCH 3/7] CR --- Docker/bloater.sh | 35 +++++++++++++++++++++++++++++++++++ Docker/bloatter.sh | 32 -------------------------------- Docker/prepare-firefox.sh | 4 ++-- Docker/run-analysis.sh | 12 +++--------- 4 files changed, 40 insertions(+), 43 deletions(-) create mode 100755 Docker/bloater.sh delete mode 100755 Docker/bloatter.sh diff --git a/Docker/bloater.sh b/Docker/bloater.sh new file mode 100755 index 0000000..e7ba8ee --- /dev/null +++ b/Docker/bloater.sh @@ -0,0 +1,35 @@ +#!/bin/bash +BLOATING_DOMAINS=$(cat bloating-domains.txt) + +bloat_firefox(){ + if [ "$#" = 0 ]; then + echo "Bloating Firefox by bloating defined domain list..." + DOMAINS=$(printf '%s\n' "${BLOATING_DOMAINS[@]}") + else + echo "Bloating Firefox by requested domain list..." + DOMAINS=`node filter-requested-domains.js "$(echo $1 | jq .third_party_domains)"` + echo "selected domains" + echo $DOMAINS + fi + + if [ -n "$DOMAINS" ]; then + while IFS= read -r DOMAIN; do + # these domains return a 404 anyways, no need to waste time on them: + if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi + load_website "$DOMAIN?hl=pl" "$DOMAIN" + sleep 1 # sometimes the consent popup needs a little time + open_console + grab "$DOMAIN before" + (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip + keycombo Control_L v + sleep 0.3 + xdotool key Return + sleep 1.5 + grab "$DOMAIN after" + done <<< "$DOMAINS" + else + echo "No need to blot" + fi +} + + diff --git a/Docker/bloatter.sh b/Docker/bloatter.sh deleted file mode 100755 index be8b65b..0000000 --- a/Docker/bloatter.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -BLOATING_DOMAINS=$(while IFS= read -r line; do echo "$line" -done Date: Tue, 16 Aug 2022 12:42:21 +0200 Subject: [PATCH 4/7] Fix typo --- Docker/bloater.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Docker/bloater.sh b/Docker/bloater.sh index e7ba8ee..ee38cf6 100755 --- a/Docker/bloater.sh +++ b/Docker/bloater.sh @@ -28,7 +28,7 @@ bloat_firefox(){ grab "$DOMAIN after" done <<< "$DOMAINS" else - echo "No need to blot" + echo "No need to bloat" fi } From 4b1db397e28d74b10cec92976c8c31fc3afdf836 Mon Sep 17 00:00:00 2001 From: Arkadiusz Wieczorek Date: Mon, 12 Sep 2022 09:50:13 +0200 Subject: [PATCH 5/7] Update --- Docker/bloater.sh | 2 +- Docker/run-analysis.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Docker/bloater.sh b/Docker/bloater.sh index ee38cf6..67001fa 100755 --- a/Docker/bloater.sh +++ b/Docker/bloater.sh @@ -7,7 +7,7 @@ bloat_firefox(){ DOMAINS=$(printf '%s\n' "${BLOATING_DOMAINS[@]}") else echo "Bloating Firefox by requested domain list..." - DOMAINS=`node filter-requested-domains.js "$(echo $1 | jq .third_party_domains)"` + DOMAINS=`node filter-requested-domains.js "$1"` echo "selected domains" echo $DOMAINS fi diff --git a/Docker/run-analysis.sh b/Docker/run-analysis.sh index 4a57dc2..beb22a7 100755 --- a/Docker/run-analysis.sh +++ b/Docker/run-analysis.sh @@ -30,7 +30,7 @@ fi ORIGIN_DOMAIN=$(sed -e 's/[^/]*\/\/\([^@]*@\)\?\([^:/]*\).*/\2/' <<< "$URL") -bloat_firefox $INPUT +bloat_firefox "$DOMAINS" grab bloat_firefox click 1270 217 # the "trash" icon, so requests from plamienie don't appear in the screenshots From 8e2381abdbfda0b5fb705f73dd068add9dd6ec95 Mon Sep 17 00:00:00 2001 From: Kuba Orlik Date: Mon, 12 Sep 2022 19:22:47 +0200 Subject: [PATCH 6/7] =?UTF-8?q?Skr=C3=B3ci=C5=82em=20list=C4=99=20domen=20?= =?UTF-8?q?do=20przedplamienia,=20bo=20cz=C4=99=C5=9B=C4=87=20z=20nich=20j?= =?UTF-8?q?ednak=20wyst=C4=99puje=20dosy=C4=87=20rzadko,=20a=20op=C3=B3?= =?UTF-8?q?=C5=BAniaj=C4=85=20start=20kontenera?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Docker/bloating-domains.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Docker/bloating-domains.txt b/Docker/bloating-domains.txt index 9803b74..17cf387 100644 --- a/Docker/bloating-domains.txt +++ b/Docker/bloating-domains.txt @@ -4,12 +4,3 @@ hotjar.com maps.google.com linkedin.com cookielaw.org -googletagmanager.com -googleapis.com -www.google.com -sirdata.com -xandr.com -site.adform.com -adtonos.com/pl/home-pl -adtraction.com/pl -www.cookiebot.com \ No newline at end of file From 843b6efc4d929da0818e1081a9f930612f5a7fe4 Mon Sep 17 00:00:00 2001 From: Arkadiusz Wieczorek Date: Sun, 20 Nov 2022 13:09:02 +0100 Subject: [PATCH 7/7] Fix error with parsing domain list --- Docker/bloater.sh | 10 +++++----- Docker/bloating-domains.txt | 16 ++++++++-------- Docker/filter-requested-domains.js | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Docker/bloater.sh b/Docker/bloater.sh index 67001fa..9fe7016 100755 --- a/Docker/bloater.sh +++ b/Docker/bloater.sh @@ -4,15 +4,15 @@ BLOATING_DOMAINS=$(cat bloating-domains.txt) bloat_firefox(){ if [ "$#" = 0 ]; then echo "Bloating Firefox by bloating defined domain list..." - DOMAINS=$(printf '%s\n' "${BLOATING_DOMAINS[@]}") + DOMAINS_LIST=$(printf '%s\n' "${BLOATING_DOMAINS[@]}") else echo "Bloating Firefox by requested domain list..." - DOMAINS=`node filter-requested-domains.js "$1"` + DOMAINS_LIST=`node filter-requested-domains.js "$1"` echo "selected domains" - echo $DOMAINS + echo $DOMAINS_LIST fi - if [ -n "$DOMAINS" ]; then + if [ -n "$DOMAINS_LIST" ]; then while IFS= read -r DOMAIN; do # these domains return a 404 anyways, no need to waste time on them: if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi @@ -26,7 +26,7 @@ bloat_firefox(){ xdotool key Return sleep 1.5 grab "$DOMAIN after" - done <<< "$DOMAINS" + done <<< "$DOMAINS_LIST" else echo "No need to bloat" fi diff --git a/Docker/bloating-domains.txt b/Docker/bloating-domains.txt index 9803b74..a0709fa 100644 --- a/Docker/bloating-domains.txt +++ b/Docker/bloating-domains.txt @@ -1,15 +1,15 @@ facebook.com google.com -hotjar.com -maps.google.com -linkedin.com -cookielaw.org -googletagmanager.com googleapis.com -www.google.com +googletagmanager.com +hotjar.com +linkedin.com +maps.google.com sirdata.com -xandr.com site.adform.com +www.cookiebot.com +www.google.com +xandr.com adtonos.com/pl/home-pl adtraction.com/pl -www.cookiebot.com \ No newline at end of file +cookielaw.org \ No newline at end of file diff --git a/Docker/filter-requested-domains.js b/Docker/filter-requested-domains.js index 637f41c..75e13a2 100644 --- a/Docker/filter-requested-domains.js +++ b/Docker/filter-requested-domains.js @@ -4,11 +4,11 @@ const BLOATING_DOMAINS = ( fs.readFileSync(pth.join(__dirname, "bloating-domains.txt")) + "" ).split("\n"); const input = process.argv[2]; -const REQUESTED_DOMAINS = JSON.parse(input); +const REQUESTED_DOMAINS = input.split('\n'); const array_diff = REQUESTED_DOMAINS.filter( (v) => !BLOATING_DOMAINS.includes(v) -); + ); for (let i in array_diff) { console.log(array_diff[i]);