urqforum-parser/urq.js

100 lines
2.6 KiB
JavaScript

const puppeteer = require('puppeteer');
const mysql = require('mysql2/promise');
const config = require("./config.json");
/**
 * Sequentially runs an (optionally async) callback over every element.
 *
 * Each invocation is awaited before the next one starts, so callbacks never
 * overlap. The array length is re-read on every step.
 *
 * @param {Array} array - Elements to visit, in index order.
 * @param {Function} callback - Called as `callback(element, index, args)`.
 * @param {Array} [args=[]] - Extra value forwarded unchanged to every call.
 * @returns {Promise<void>} Resolves once the last callback has settled.
 */
async function asyncForEach(array, callback, args = []) {
  let position = 0;
  while (position < array.length) {
    const element = array[position];
    await callback(element, position, args);
    position += 1;
  }
}
/**
 * Scraper for the thread listing of the urq.borda.ru forum.
 *
 * Thread links are collected with a headless Puppeteer browser and stored in
 * the `<DB_PREFIX>bordathreads` MySQL table.
 */
class Urq {
  /**
   * Builds the MySQL connection options from `config.json`.
   * No connection is opened until `scrape()` is called.
   */
  constructor() {
    this.dbopts = {
      host: 'localhost',
      database: config.DB_NAME,
      user: config.DB_USER,
      password: config.DB_PASSWORD,
      protocol: 'mysql',
      port: '3306',
      query: {pool: true}
    };
  }

  /**
   * Runs a complete scrape: connects to MySQL, launches the browser, makes
   * sure the destination table exists, scrapes the listing pages and then
   * releases both the browser and the database connection.
   *
   * The browser and the connection are released in `finally` blocks so that
   * a failure cannot leave a Chromium process or an open socket behind
   * (an open MySQL socket would keep the Node process alive).
   *
   * @returns {Promise<void>}
   * @throws Rejects when connecting, launching the browser or creating the
   *   table fails; errors inside `scrape_pages()` are logged there instead.
   */
  async scrape() {
    this.db = await mysql.createConnection(this.dbopts);
    try {
      this.browser = await puppeteer.launch({
        "headless": true,
        "args": [
          "--disable-web-security",
          "--no-sandbox",
          "--disable-dev-shm-usage"
        ]
      });
      try {
        await this.db.execute(`
          CREATE TABLE IF NOT EXISTS \`${config.DB_PREFIX}bordathreads\` (
          \`id\` Int( 255 ) UNSIGNED AUTO_INCREMENT NOT NULL,
          \`url\` Varchar(255) UNIQUE NOT NULL,
          \`name\` Varchar(255) NOT NULL,
          PRIMARY KEY ( \`id\` )
          );`);
        await this.scrape_pages();
      } finally {
        await this.browser.close();
      }
    } finally {
      await this.db.end();
    }
  }

  /**
   * Walks the paginated thread listing, collects every thread link and
   * inserts the links into the `bordathreads` table.
   *
   * Requires `this.browser` and `this.db` to be set (see `scrape()`).
   * Failures are logged rather than thrown: a failed INSERT (for example a
   * duplicate `url`, which the table declares UNIQUE) only skips that row,
   * while a navigation failure aborts the run.
   *
   * @param {number} [lastOffset=400] - Highest listing offset to request (inclusive).
   * @param {number} [pageSize=20] - Offset increment between listing pages.
   * @returns {Promise<void>}
   * @throws {RangeError} When `pageSize` is not a positive number.
   */
  async scrape_pages(lastOffset = 400, pageSize = 20) {
    if (!(pageSize > 0)) {
      // A non-positive step would make the pagination loop run forever.
      throw new RangeError("pageSize must be a positive number");
    }
    let page;
    try {
      console.log("Scraping threads.");
      page = await this.browser.newPage();
      const threads = [];
      for (let offset = 0; offset <= lastOffset; offset += pageSize) {
        console.log("Offset "+offset);
        await page.goto("http://urq.borda.ru?0-0-"+offset, {
          "waitUntil" : "load",
          "timeout": 60000
        });
        // This callback is serialized and executed inside the browser page,
        // so it must stay self-contained and may not touch outer variables.
        const pagethreads = await page.evaluate(() => {
          try {
            // `name` is the raw inner HTML of the anchor, stored as-is.
            return Array.from(document.querySelectorAll(".font3 a"), (link) => ({
              "url": link.getAttribute("href"),
              "name": link.innerHTML,
            }));
          } catch (e) {
            return [];
          }
        });
        console.log("Found "+pagethreads.length+" threads.");
        threads.push(...pagethreads);
      }
      for (const thread of threads) {
        try {
          await this.db.execute(
            `INSERT INTO ${config.DB_PREFIX}bordathreads (url, name) VALUES( ?, ? )`,
            [thread.url, thread.name]
          );
        } catch(e) {
          // Best effort: keep inserting the remaining threads.
          console.log(e);
        }
      }
    } catch(e) {
      console.log(e);
    } finally {
      // `page` is still undefined when newPage() itself failed.
      if (page) {
        await page.close();
      }
    }
  }
}
// Expose the Urq class as this module's only export (CommonJS).
module.exports = Urq;