screenshot-service/src/request.ts
2022-07-15 14:15:38 +02:00

276 lines
7.9 KiB
TypeScript

import { is, predicates } from "@sealcode/ts-predicates";
import { ChildProcessWithoutNullStreams, spawn } from "child_process";
import { promises as fs } from "fs";
import { Readable } from "stream";
import { v4 as uuid } from "uuid";
import containerPool from "./container-pool";
import { IMAGE_NAME, VOLUME_MOUNT } from "./docker-args";
import { q, requests } from "./memory";
// Requests in the order they were created. A request is appended by the
// ScreenshotRequest constructor and removed once its analysis run ends, so the
// list holds both waiting and currently running requests. It is declared with
// `let` because removal reassigns the binding to a filtered copy.
let queue_order: ScreenshotRequest[] = [];
/**
 * One screenshot entry as reported by the analysis process (the `new_file`
 * member of a JSON message read from its stdout).
 */
export type Image = {
// NOTE(review): presumably the URL of the network request shown in the
// screenshot - confirm against the analysis script.
url: string;
// Used to group screenshots; the best screenshots are picked per domain.
domain: string;
// Header values visible in the screenshot; only the values (not the keys) are
// used when choosing which screenshots to keep.
found_headers: Record<string, string>;
// File name of the screenshot, relative to the request's static directory.
filename: string;
};
/**
 * A single screenshot/analysis job for one URL.
 *
 * Creating an instance registers it in the shared `requests` map, appends it
 * to the module-level queue order and schedules {@link ScreenshotRequest.exec}
 * on the shared job queue `q`.
 */
export default class ScreenshotRequest {
	public id = uuid();
	public status = "waiting";
	public output = "";
	public images: Image[] = [];
	public request_time: number = Date.now();
	public started_time: number | null = null;
	public finished_time: number | null = null;
	public processing_took: number | null = null;
	public waiting_took: number | null = null;
	public process: ChildProcessWithoutNullStreams;
	public current_action = "Inicjalizowanie...";

	/** Tail of the analysis process' stdout that does not yet form a complete line. */
	private stdout_remainder = "";

	/**
	 * @param url address of the page to analyse
	 * @param domains third-party domains passed to the analysis script
	 */
	constructor(public url: string, public domains: string[]) {
		q.push(async () => {
			return this.exec();
		});
		requests[this.id] = this;
		queue_order.push(this);
	}

	/**
	 * @returns how many requests precede this one in the queue order, or `0`
	 * when this request is no longer waiting
	 */
	getJobsAhead(): number {
		if (this.status !== "waiting") {
			return 0;
		}
		const position = queue_order.indexOf(this);
		// A request missing from the queue has every queued request ahead of it.
		return position === -1 ? queue_order.length : position;
	}

	/**
	 * Builds the serialisable snapshot of this request that is sent to clients.
	 *
	 * @returns the request state, including a user-facing `current_action`
	 * message, the selected screenshots and a cache-busted preview URL
	 */
	async getJSON(): Promise<{
		url: string;
		domains: string[];
		jobs_ahead: number;
		id: string;
		status: string;
		output: string;
		images: Image[];
		request_time: number;
		started_time: number | null;
		finished_time: number | null;
		processing_took: number | null;
		waiting_took: number | null;
		elapsed_time_s: number;
		zip_url: string | null;
		current_action: string;
		preview: string;
	}> {
		const jobs_ahead = this.getJobsAhead();
		const is_finished = this.status === "finished";

		let current_action = this.current_action;
		if (is_finished) {
			current_action = "Zakończono!";
		} else if (this.status === "waiting") {
			current_action = `Oczekiwanie w kolejce. Przed Tobą jest ${jobs_ahead} ${
				jobs_ahead === 1 ? "osoba" : "osób"
			} w kolejce...`;
		}

		// Finished requests stop the clock at their finish time.
		const clock_end = is_finished ? this.finished_time || -1 : Date.now();

		return {
			url: this.url,
			current_action,
			domains: this.domains,
			jobs_ahead,
			id: this.id,
			status: this.status,
			output: this.output,
			images: this.getGoodImages(),
			request_time: this.request_time,
			started_time: this.started_time,
			finished_time: this.finished_time,
			processing_took: this.processing_took,
			waiting_took: this.waiting_took,
			elapsed_time_s: Math.round((clock_end - this.request_time) / 1000),
			zip_url: is_finished
				? `/api/requests/${this.id}/all-screenshots`
				: null,
			preview: await this.getPreviewURL(),
		};
	}

	/**
	 * Returns the preview image URL with the file's modification time (in whole
	 * seconds) as a cache-busting `v` parameter.
	 *
	 * The modification time is read through `fs` instead of spawning the
	 * GNU-specific `stat -c %Y` command on every call. When the preview file
	 * cannot be read the parameter is left empty.
	 */
	async getPreviewURL(): Promise<string> {
		const preview_path = `${__dirname}/../../static/${this.id}/preview.jpg`;
		let mtime = "";
		try {
			const stats = await fs.stat(preview_path);
			mtime = String(Math.floor(stats.mtimeMs / 1000));
		} catch (error) {
			// A preview that has not been written yet is expected; anything
			// else is worth a log line.
			if ((error as NodeJS.ErrnoException).code !== "ENOENT") {
				console.log(String(error));
			}
		}
		return `/static/${this.id}/preview.jpg?v=${mtime}`;
	}

	/**
	 * Picks, per domain, a small set of screenshots that together show every
	 * header value found for that domain.
	 *
	 * Screenshots are ordered by the number of header values they contain
	 * (ties broken by the total length of those values), then taken greedily:
	 * a screenshot is kept only if it shows at least one value not shown yet,
	 * and the walk stops once all values of the domain are covered.
	 */
	getGoodImages(): Image[] {
		const result: Image[] = [];
		const domains = Array.from(
			new Set(this.images.map((image) => image.domain))
		);
		for (const domain of domains) {
			// Compute the header values once per image instead of on every
			// comparison.
			const candidates = this.images
				.filter((image) => image.domain === domain)
				.map((image) => {
					const values = Object.values(image.found_headers);
					return { image, values, total_length: values.join("").length };
				})
				.sort(
					(a, b) =>
						b.values.length - a.values.length ||
						b.total_length - a.total_length
				);

			const all_values = new Set<string>();
			for (const candidate of candidates) {
				for (const value of candidate.values) {
					all_values.add(value);
				}
			}

			const shown_values = new Set<string>();
			for (const candidate of candidates) {
				let any_new_values = false;
				for (const value of candidate.values) {
					if (!shown_values.has(value)) {
						shown_values.add(value);
						any_new_values = true;
					}
				}
				if (any_new_values) {
					result.push(candidate.image);
				}
				if (shown_values.size === all_values.size) {
					break;
				}
			}
		}
		return result;
	}

	/**
	 * Marks the request as finished and, when it was started, records how long
	 * it waited in the queue and how long processing took.
	 */
	setFinished(): void {
		this.status = "finished";
		this.finished_time = Date.now();
		if (this.started_time) {
			this.processing_took = this.finished_time - this.started_time;
			this.waiting_took = this.started_time - this.request_time;
		}
	}

	/**
	 * Runs the analysis script inside a pooled container and collects its
	 * output.
	 *
	 * @returns a promise that resolves when the script exits with code 0 and
	 * rejects with an `Error` when it exits otherwise or cannot be spawned. In
	 * both cases the request is marked finished, the container is closed and
	 * the request is removed from the queue order.
	 */
	async exec(): Promise<void> {
		this.started_time = Date.now();
		this.status = "running";
		const container = containerPool.getContainer();
		await container.waitReady();
		return new Promise<void>((resolve, reject) => {
			let settled = false;
			// Cleanup must run exactly once even if both 'error' and 'close'
			// are emitted for the same child.
			const finish = (error?: Error): void => {
				if (settled) {
					return;
				}
				settled = true;
				this.flushStdout();
				this.setFinished();
				container.close();
				queue_order = queue_order.filter((request) => request !== this);
				if (error) {
					reject(error);
				} else {
					resolve();
				}
			};

			const child = spawn(
				"docker",
				[
					"exec",
					container.id,
					"/opt/run-analysis.sh",
					JSON.stringify({
						url: this.url,
						third_party_domains: this.domains,
					}),
					this.id,
				],
				{ cwd: process.cwd() }
			);
			this.process = child;

			// Decode as UTF-8 at the stream level so multi-byte characters
			// split across chunks are not corrupted.
			child.stdout.setEncoding("utf8");
			child.stderr.setEncoding("utf8");

			// Without an 'error' listener a failed spawn would be thrown as an
			// unhandled event and the promise would never settle.
			child.on("error", (error) => {
				this.output += `${error.message}\n`;
				if (child.pid === undefined) {
					// The process never started, so 'close' is not guaranteed.
					finish(error);
				}
			});
			child.on("close", (exitCode, signal) => {
				if (exitCode === 0) {
					finish();
				} else {
					finish(
						new Error(
							`Analysis process for request ${
								this.id
							} exited with code ${String(exitCode)}${
								signal ? ` (signal ${signal})` : ""
							}`
						)
					);
				}
			});
			child.stdout.on("data", (d: Buffer | string) => {
				const text = d.toString();
				this.consumeStdout(text);
				this.output += text;
			});
			child.stderr.on("data", (d: Buffer | string) => {
				this.output += d.toString();
			});
		});
	}

	/**
	 * Streams a ZIP archive of the selected screenshots.
	 *
	 * @returns the stdout of a `zip` process run in a throw-away container
	 */
	getZIP(): Readable {
		const zipper = spawn("docker", [
			"run",
			// Remove the container once zip exits; otherwise every download
			// leaves a stopped container behind.
			"--rm",
			"-v",
			VOLUME_MOUNT,
			IMAGE_NAME,
			"zip",
			"--junk-paths",
			"-",
			...this.getGoodImages().map(
				(image) => `/opt/static/${this.id}/${image.filename}`
			),
		]);
		// A failed spawn must not take the whole server down.
		zipper.on("error", (error) => {
			console.error(
				`Could not create ZIP for request ${this.id}: ${error.message}`
			);
		});
		// Nothing reads stderr; drain it so a full pipe cannot stall zip.
		zipper.stderr.resume();
		return zipper.stdout;
	}

	/**
	 * Feeds a piece of the analysis process' stdout to the message parser.
	 *
	 * A chunk that is exactly one JSON document is handled directly. Otherwise
	 * the data is buffered and parsed line by line, so several messages
	 * delivered in one chunk, or one message split across chunks, are not
	 * lost.
	 */
	private consumeStdout(chunk: string): void {
		if (this.stdout_remainder === "" && this.applyMessage(chunk)) {
			return;
		}
		const lines = (this.stdout_remainder + chunk).split("\n");
		this.stdout_remainder = lines.pop() ?? "";
		for (const line of lines) {
			this.applyMessage(line);
		}
	}

	/** Parses whatever is left in the stdout buffer once the process has ended. */
	private flushStdout(): void {
		const rest = this.stdout_remainder;
		this.stdout_remainder = "";
		if (rest !== "") {
			this.applyMessage(rest);
		}
	}

	/**
	 * Applies one message from the analysis process: `new_file` adds a
	 * screenshot, `current_action` updates the progress text.
	 *
	 * @returns `true` when `raw` was valid JSON, `false` otherwise (non-JSON
	 * output is expected and only kept in `output`)
	 */
	private applyMessage(raw: string): boolean {
		let parsed: unknown;
		try {
			parsed = JSON.parse(raw) as unknown;
		} catch (e) {
			return false;
		}
		if (
			is(parsed, predicates.object) &&
			is(parsed.new_file, predicates.object)
		) {
			this.images.push(parsed.new_file as Image);
		}
		if (
			is(parsed, predicates.object) &&
			is(parsed.current_action, predicates.string)
		) {
			this.current_action = parsed.current_action;
		}
		return true;
	}
}