import { is, predicates } from "@sealcode/ts-predicates";
import { ChildProcessWithoutNullStreams, spawn } from "child_process";
import { Readable } from "stream";
import { v4 as uuid } from "uuid";
import containerPool from "./container-pool";
import { IMAGE_NAME, VOLUME_MOUNT } from "./docker-args";
import { q, requests } from "./memory";

// Requests that are still waiting or running, in arrival order.
let queue_order: ScreenshotRequest[] = [];

/**
 * A single screenshot produced by the analysis, together with the header
 * values visible on it (`found_headers`) and the file it was saved under.
 */
export type Image = {
	url: string;
	domain: string;
	found_headers: Record<string, string>;
	filename: string;
};

export default class ScreenshotRequest {
	public id = uuid();
	public status = "waiting";
	public output = "";
	public images: Image[] = [];
	public request_time: number = Date.now();
	public started_time: number | null = null;
	public finished_time: number | null = null;
	public processing_took: number | null = null;
	public waiting_took: number | null = null;
	public process: ChildProcessWithoutNullStreams;
	public current_action = "Inicjalizowanie..."; // "Initializing..."

	constructor(public url: string, public domains: string[]) {
		// Queue the analysis and register the request so it can be looked up by id.
		q.push(async () => {
			return this.exec();
		});
		requests[this.id] = this;
		queue_order.push(this);
	}

	getJobsAhead(): number {
		if (this.status != "waiting") {
			return 0;
		}
		let count = 0;
		for (const request of queue_order) {
			if (request == this) {
				break;
			}
			count++;
		}
		return count;
	}

	async getJSON(): Promise<{
		url: string;
		domains: string[];
		jobs_ahead: number;
		id: string;
		status: string;
		output: string;
		images: Image[];
		request_time: number;
		started_time: number | null;
		finished_time: number | null;
		processing_took: number | null;
		waiting_took: number | null;
		elapsed_time_s: number;
		zip_url: string | null;
		current_action: string;
		preview: string;
	}> {
		const jobs_ahead = this.getJobsAhead();
		return {
			url: this.url,
			// Polish UI strings: "Zakończono!" = "Finished!"; the waiting message reads
			// "Waiting in the queue. There are ${jobs_ahead} people ahead of you in the queue...".
			current_action:
				this.status == "finished"
					? "Zakończono!"
					: this.status == "waiting"
					? `Oczekiwanie w kolejce. Przed Tobą jest ${jobs_ahead} ${
							jobs_ahead === 1 ? "osoba" : "osób"
					  } w kolejce...`
					: this.current_action,
			domains: this.domains,
			jobs_ahead,
			id: this.id,
			status: this.status,
			output: this.output,
			images: this.getGoodImages(),
			request_time: this.request_time,
			started_time: this.started_time,
			finished_time: this.finished_time,
			processing_took: this.processing_took,
			waiting_took: this.waiting_took,
			elapsed_time_s: Math.round(
				((this.status === "finished" ? this.finished_time || -1 : Date.now()) -
					this.request_time) /
					1000
			),
			zip_url:
				this.status === "finished"
					? `/api/requests/${this.id}/all-screenshots`
					: null,
			preview: await this.getPreviewURL(),
		};
	}

	async getPreviewURL(): Promise<string> {
		// Use the preview's mtime as a cache-busting query parameter so clients
		// refetch the image whenever it changes.
		const process = spawn("stat", [
			"-c",
			"%Y",
			`${__dirname}/../../static/${this.id}/preview.jpg`,
		]);
		let result = "";
		process.stdout.on("data", (data) => (result += data.toString().trim()));
		process.stderr.on("data", (data) => console.log(data.toString()));
		const mtime = await new Promise((resolve) => {
			process.on("close", () => resolve(result));
		});
		return `/static/${this.id}/preview.jpg?v=${mtime}`;
	}

	getGoodImages(): Image[] {
		/* Find the best set of screenshots: one that shows every header value
		   that appears in the collected headers, using as few screenshots as
		   possible.

		   The current approach is to sort the screenshots by how much
		   highlighted information each one contains, then go through them one
		   by one, checking off the values each provides, and stop once every
		   value has been checked off. */
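		/* For example (illustrative values only): if screenshot A shows the header
		   values "abc" and "def" while screenshot B shows only "def", A is taken
		   first and covers both values, so B adds nothing new and is skipped. */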
		const result: Image[] = [];
		const domains = Array.from(
			new Set(this.images.map((image) => image.domain))
		);
		for (const domain of domains) {
			const images = this.images
				.filter((image) => image.domain === domain)
				.sort((image1, image2) => {
					// Most header values first...
					const count_difference =
						Object.values(image2.found_headers).length -
						Object.values(image1.found_headers).length;
					if (count_difference !== 0) {
						return count_difference;
					}
					// ...and with the same amount of headers, longest values first.
					return (
						Object.values(image2.found_headers).join("").length -
						Object.values(image1.found_headers).join("").length
					);
				});
			const all_values = Array.from(
				new Set(
					images
						.map((image) => Object.values(image.found_headers))
						.reduce((a, b) => a.concat(b))
				)
			);
			const shown_values_for_domain = new Set();
			for (const image of images) {
				const values_in_image = Object.values(image.found_headers);
				let any_new_values = false;
				for (const value of values_in_image) {
					if (!shown_values_for_domain.has(value)) {
						shown_values_for_domain.add(value);
						any_new_values = true;
					}
				}
				if (any_new_values) {
					result.push(image);
				}
				if (shown_values_for_domain.size == all_values.length) {
					break;
				}
			}
		}
		return result;
	}

	setFinished(): void {
		this.status = "finished";
		this.finished_time = Date.now();
		if (this.started_time) {
			this.processing_took = this.finished_time - this.started_time;
			this.waiting_took = this.started_time - this.request_time;
		}
	}

	async exec(): Promise<void> {
		this.started_time = Date.now();
		this.status = "running";
		const container = containerPool.getContainer();
		await container.waitReady();
		return new Promise((resolve, reject) => {
			// Run the analysis script inside the pooled container; it receives the
			// target URL and third-party domains as a JSON argument plus the request id.
			this.process = spawn(
				"docker",
				[
					"exec",
					container.id,
					"/opt/run-analysis.sh",
					JSON.stringify({
						url: this.url,
						third_party_domains: this.domains,
					}),
					this.id,
				],
				{ cwd: process.cwd() }
			);
			this.process.on("close", (exitCode) => {
				this.setFinished();
				container.close();
				queue_order = queue_order.filter((request) => request != this);
				if (exitCode === 0) {
					resolve();
				} else {
					reject(
						new Error(`Analysis process exited with code ${String(exitCode)}`)
					);
				}
			});
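			/* The analysis script writes JSON progress lines to stdout. Based on the
			   parsing below, a line is expected to look roughly like one of (shapes
			   inferred from this handler, illustrative only):
			     {"current_action": "..."}
			     {"new_file": {"url": "...", "domain": "...", "found_headers": {...}, "filename": "..."}}
			   Chunks that do not parse as JSON are still appended to the raw output. */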
			this.process.stdout.on("data", (d: Buffer) => {
				try {
					const parsed = JSON.parse(d.toString()) as unknown;
					if (
						is(parsed, predicates.object) &&
						is(parsed.new_file, predicates.object)
					) {
						this.images.push(parsed.new_file as Image);
					}
					if (
						is(parsed, predicates.object) &&
						is(parsed.current_action, predicates.string)
					) {
						this.current_action = parsed.current_action;
					}
				} catch (e) {
					// Not every stdout chunk is valid JSON; ignore parse errors.
				}
				this.output += d.toString();
				/* console.log("DATA!", d.toString()); */
			});
			this.process.stderr.on("data", (d: Buffer) => {
				this.output += d.toString();
				/* console.log("STDERR!", d.toString()); */
			});
		});
	}

	getZIP(): Readable {
		// Zip the selected screenshots inside the analysis image and stream the
		// archive straight from the container's stdout.
		const process = spawn("docker", [
			"run",
			"-v",
			VOLUME_MOUNT,
			IMAGE_NAME,
			"zip",
			"--junk-paths",
			"-",
			...this.getGoodImages().map(
				(image) => `/opt/static/${this.id}/${image.filename}`
			),
		]);
		return process.stdout;
	}
}
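
// Illustrative usage (hypothetical caller code, not part of this module;
// `res` is assumed to be an HTTP response stream):
//
//   const request = new ScreenshotRequest("https://example.com", ["cdn.example.net"]);
//   const json = await request.getJSON(); // progress, queue position, images so far
//   request.getZIP().pipe(res);           // stream the selected screenshots as a ZIP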