Merge pull request 'Add firefox bloating (przedplamienie FF)' (#34) from #29 into master

Reviewed-on: #34
Arkadiusz Wieczorek 2022-11-29 12:45:46 +01:00
commit e83113bbde
9 changed files with 72 additions and 16 deletions

View File

@@ -1,5 +1,7 @@
 /// <reference types="node" />
+/// <reference types="node" />
 import { ChildProcessWithoutNullStreams } from "child_process";
+import { Readable } from "stream";
 export declare type Image = {
     url: string;
     domain: string;
@@ -40,7 +42,9 @@ export default class ScreenshotRequest {
         current_action: string;
         preview: string;
     }>;
+    getPreviewURL(): Promise<string>;
     getGoodImages(): Image[];
     setFinished(): void;
     exec(): Promise<void>;
+    getZIP(): Readable;
 }

Docker/bloater.sh (new executable file, 35 additions)
View File

@@ -0,0 +1,35 @@
#!/bin/bash
# bloating-domains.txt holds one domain per line; these get pre-visited
# ("bloated") so their consent popups are already accepted:
BLOATING_DOMAINS=$(cat bloating-domains.txt)

bloat_firefox(){
    if [ "$#" = 0 ]; then
        echo "Bloating Firefox using the predefined domain list..."
        DOMAINS_LIST=$BLOATING_DOMAINS
    else
        echo "Bloating Firefox using the requested domain list..."
        # keep only the requested domains that are not on the default list already:
        DOMAINS_LIST=$(node filter-requested-domains.js "$1")
        echo "selected domains:"
        echo "$DOMAINS_LIST" # quoted, so the newlines survive
    fi
    if [ -n "$DOMAINS_LIST" ]; then
        while IFS= read -r DOMAIN; do
            # these domains return a 404 anyways, no need to waste time on them:
            if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi
            load_website "$DOMAIN?hl=pl" "$DOMAIN"
            sleep 1 # sometimes the consent popup needs a little time
            open_console
            grab "$DOMAIN before"
            # paste click-accept-all.js (flattened to one line) into the devtools console and run it:
            (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip
            keycombo Control_L v
            sleep 0.3
            xdotool key Return
            sleep 1.5
            grab "$DOMAIN after"
        done <<< "$DOMAINS_LIST"
    else
        echo "No need to bloat"
    fi
}
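For reference, a minimal usage sketch of the two call modes (the domain names are illustrative; the helpers load_website, grab, is_http_error etc. are assumed to come from utils.sh, which both callers below source first):

source ./utils.sh
source ./bloater.sh

# no argument: pre-bloat everything in bloating-domains.txt
bloat_firefox

# one argument: a newline-separated list; domains already on the default
# list are removed by filter-requested-domains.js before visiting
bloat_firefox "$(printf '%s\n' example.com shop.example.com)"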

View File

@@ -0,0 +1,6 @@
facebook.com
google.com
hotjar.com
maps.google.com
linkedin.com
cookielaw.org

View File

@@ -0,0 +1,15 @@
const fs = require("fs");
const pth = require("path");

// domains that bloater.sh already covers by default:
const BLOATING_DOMAINS = fs
    .readFileSync(pth.join(__dirname, "bloating-domains.txt"), "utf8")
    .split("\n");

// the requested domains arrive as one newline-separated CLI argument:
const REQUESTED_DOMAINS = (process.argv[2] || "").split("\n");

// print only the requested domains that still need bloating:
const array_diff = REQUESTED_DOMAINS.filter(
    (v) => !BLOATING_DOMAINS.includes(v)
);
for (const domain of array_diff) {
    console.log(domain);
}
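A quick illustrative run (example.com is a made-up domain): given the default list above, only domains absent from bloating-domains.txt are printed back, one per line, which is exactly the list bloat_firefox then iterates over:

node filter-requested-domains.js "$(printf '%s\n' facebook.com example.com)"
# facebook.com is on the default list, so only this is printed:
# example.com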

View File

@@ -3,6 +3,7 @@
 source ./ephemeral-x.sh
 source ./annotate_header.sh
 source ./utils.sh
+source ./bloater.sh
 echo "{\"current_action\": \"Uruchamianie serwera X\"}"
@@ -11,6 +12,9 @@ start_firefox
 grab start_firefox
 prepare_firefox
 grab prepare_firefox
+bloat_firefox
+grab bloat_firefox
 echo "{\"current_action\": \"Oczekiwanie na URL do analizy...\", \"code\": \"ready\"}"
 ./eternal-sleep.sh &
 wait

View File

@@ -10,10 +10,13 @@ unquote(){
     echo $1 | sed 's/"//g'
 }
+echo $INPUT
 URL=$(unquote $(echo $INPUT | jq .url))
 DOMAINS=`node array-to-lines.js "$(echo $INPUT | jq .third_party_domains)"`
 source ./utils.sh
+source ./bloater.sh
 PREVIEW="TRUE" # set to "TRUE" in order to enable automatic screenshots kept in preview.png
@@ -28,21 +31,8 @@ fi
 ORIGIN_DOMAIN=$(sed -e 's/[^/]*\/\/\([^@]*@\)\?\([^:/]*\).*/\2/' <<< "$URL")
-while IFS= read -r DOMAIN; do
-    # these domains return a 404 anyways, no need to waste time on them:
-    if is_http_error "$DOMAIN"; then echo "skipping $DOMAIN"; continue; fi
-    load_website "$DOMAIN?hl=pl" "$DOMAIN"
-    sleep 1 # sometimes the consent popup needs a little time
-    open_console
-    grab "$DOMAIN before"
-    (tr '\n' ' ' < click-accept-all.js) | xclip -sel clip
-    keycombo Control_L v
-    sleep 0.3
-    xdotool key Return
-    sleep 1.5
-    grab "$DOMAIN after"
-done <<< "$DOMAINS"
+bloat_firefox "$DOMAINS"
+grab bloat_firefox
 click 1270 217 # the "trash" icon, so requests from plamienie don't appear in the screenshots

package-lock.json (generated, 1 addition)
View File

@@ -5,6 +5,7 @@
     "requires": true,
     "packages": {
         "": {
+            "name": "screenshot-service",
             "version": "1.0.0",
             "license": "ISC",
             "dependencies": {

View File

@@ -58,6 +58,7 @@ router.get("/", async (ctx) => {
     type="text"
     name="domains"
     id="domains"
+    style="width: calc(100vw - 30%)"
     value="doubleclick.net,facebook.com"
 />
 <br />