const config = require('config');
const axios = require("axios");
const cheerio = require("cheerio");
const express = require('express');
const cors = require('cors');
const stream = require('stream');
const jsonxml = require('json2xml');
const beautify = require('xml-beautifier');
const MongoClient = require('mongodb').MongoClient;
const ObjectID = require('mongodb').ObjectID;
const apartments = require('./apartments.js');
const houses = require('./houses.js');

/**
 * `Array.prototype.filter` predicate that keeps only the first occurrence of
 * each value.
 *
 * @param {*} value - Current element.
 * @param {number} index - Index of the current element.
 * @param {Array} self - The array being filtered.
 * @returns {boolean} True when `value` does not appear earlier in `self`.
 */
const distinct = (value, index, self) => self.indexOf(value) === index;

// jobs
// NOTE: `mongoUrl` stays a `var` on purpose. The database setup section of
// this file declares it a second time with `var`; switching either
// declaration to `const`/`let` would be a redeclaration SyntaxError.
var mongoUrl = config.get("mongo");
const agendaDb = config.get("agenda");
const Agenda = require('agenda').Agenda;
const agenda = new Agenda({ db: { address: agendaDb } });

/**
 * Agenda job "scrape".
 *
 * Loads the scrape document whose `_id` is carried in the job data, fetches
 * the listing page(s) under its `sourceUrl`, follows every distinct
 * `.property-link` found inside `#placardContainer`, parses each property
 * page with `apartments.apartment`, and finally stores the collected
 * properties on the scrape document with status "done" and an `endDate`.
 *
 * `client` and `database` are module-level bindings assigned by the database
 * setup section of this file; they are only read when a job actually runs.
 *
 * @param {object} job - Agenda job; `job.attrs.data._id` identifies the scrape.
 * @param {Function} done - Completion callback, called with an error on failure.
 */
agenda.define('scrape', async function (job, done) {
  const { _id } = job.attrs.data;

  // Only the first listing page is fetched. The stored `scrape.pageCount` is
  // deliberately not used as the upper bound yet; raise this limit to scrape
  // more pages.
  const lastPage = 1;

  try {
    const collection = client.db(database).collection('scrapes');
    const scrape = await collection.findOne({ _id });
    if (!scrape) {
      throw new Error(`Scrape ${_id} not found.`);
    }

    // Declared outside the page loop so results accumulate across pages
    // instead of being reset on every iteration.
    const properties = [];

    for (let page = 1; page <= lastPage; page++) {
      console.log("scrapping page " + page);
      const filterPage = await axios(`${scrape.sourceUrl}/${page}`);
      const $ = cheerio.load(filterPage.data);
      const propertyLinks = $('#placardContainer .property-link')
        .map(function () {
          return $(this).attr('href');
        })
        .get();

      for (const link of propertyLinks.filter(distinct)) {
        // A single broken property page must not abort the whole scrape, so
        // failures are logged (with their cause) and the loop continues.
        try {
          const response = await axios(link);
          const property = apartments.apartment(cheerio.load(response.data));
          property.url = link;
          properties.push(property);
          console.log(`${link} scraped.`);
        } catch (err) {
          console.error(`${link} scrape failed.`, err);
        }
      }
    }

    await collection.updateOne(
      { _id },
      { $set: { status: "done", result: properties, endDate: new Date() } },
    );
    console.log(`${_id} scraped.`);
    return done();
  } catch (err) {
    console.log(err);
    // Report the failure to Agenda; without this call the job would never
    // be marked as finished.
    return done(err);
  }
});

// Start the job processor. A start-up failure is logged and terminates the
// process instead of being left as an unhandled rejection.
agenda.start().catch((err) => {
  console.error(err);
  process.exit(1);
});

// express application
const app = express();
app.use(express.json());
app.use(cors());

// database setup
// NOTE: `mongoUrl` stays a `var` on purpose. The jobs section earlier in this
// file already declares it with `var`; `const`/`let` here would be a
// redeclaration SyntaxError.
var mongoUrl = config.get("mongo");
const database = config.get("database");
// Assigned once the connection below succeeds; stays undefined until then.
let client = undefined;

// Number of results assumed per listing page when only a page count is known.
const RESULTS_PER_PAGE = 25;

MongoClient.connect(mongoUrl, function (err, db) {
  if (err) throw err;
  console.log("Database created!");
  // NOTE(review): this prints the full connection string; if it can contain
  // credentials it should not be logged — confirm.
  console.log(mongoUrl);
  client = db;
  const dbo = db.db(database);
  dbo.createCollection("scrapes", function (err, res) {
    if (err) {
      console.log("Collection already created!");
      return;
    }
    console.log("Collection created!");
  });
});

/**
 * Returns the `scrapes` collection of the configured database.
 * Throws a TypeError while `client` is still undefined (not yet connected).
 */
function scrapesCollection() {
  return client.db(database).collection('scrapes');
}

/**
 * Reads the search filters from a request body.
 *
 * @param {object} body - Parsed JSON request body.
 * @returns {{location, description, price, beds, type, lifestyle, baths}}
 */
function readFilters(body) {
  const { location, description, price, beds, type, lifestyle, baths } = body;
  return { location, description, price, beds, type, lifestyle, baths };
}

/**
 * Fetches a search page and estimates how many results it yields.
 *
 * When the page has a `.pageRange` element, the number following the last
 * "of " in its text is taken as the page count and the result count is
 * `pageCount * RESULTS_PER_PAGE`. Otherwise the `.property-link` elements in
 * `#placardContainer` are counted directly and `pageCount` is 0.
 *
 * @param {string} query - Search URL produced by `buildQuery`.
 * @returns {Promise<{count: number, pageCount: number} | null>} The estimate,
 *   or null when the page lists no properties at all.
 * @throws Whatever `axios` throws when the page cannot be fetched.
 */
async function estimateSearch(query) {
  const filterPage = await axios(query);
  const $ = cheerio.load(filterPage.data);
  const $pageRange = $(".pageRange");

  if (!$pageRange.length) {
    const propertyLinks = $('#placardContainer .property-link')
      .map(function () {
        return $(this).attr('href');
      })
      .get();
    if (!propertyLinks.length) {
      return null;
    }
    return { count: propertyLinks.length, pageCount: 0 };
  }

  const rangeText = $pageRange.text();
  // Parse to a real number so `pageCount` is always numeric, not sometimes
  // the number 0 and sometimes a string slice; unparsable text counts as 0.
  const parsed = Number.parseInt(rangeText.slice(rangeText.lastIndexOf("of ") + 3), 10);
  const pageCount = Number.isNaN(parsed) ? 0 : parsed;
  return { count: pageCount * RESULTS_PER_PAGE, pageCount };
}

/**
 * Estimated completion time: now plus one second per expected result.
 *
 * @param {number} resultCount - Expected number of results.
 * @returns {Date}
 */
function estimateFinishDate(resultCount) {
  const dt = new Date();
  dt.setSeconds(dt.getSeconds() + resultCount);
  return dt;
}

// Health check.
app.get("/", (req, res) => res.json("ok"));

// Downloads one scrape document as a beautified XML attachment.
app.get("/scrapes/:id/files/xml", async (req, res) => {
  const id = req.params.id;
  try {
    const data = await scrapesCollection().findOne({ _id: new ObjectID(id) });
    if (!data) {
      // Unknown id: answer 404 instead of failing on a null dereference.
      return res.status(404).json();
    }

    const input = {
      ...data,
      _id: data._id.toString(),
      estimate: data.estimate?.toString(),
      // Was filled from `data.estimate` by mistake.
      createDate: data.createDate?.toString(),
    };
    const xml = jsonxml(input, { header: true });
    const fileContents = Buffer.from(beautify(xml));

    res.set('Content-disposition', 'attachment; filename=' + id + '.xml');
    res.set('Content-Type', 'text/plain');
    return res.send(fileContents);
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

// Lists every scrape document.
app.get("/scrapes", async (req, res) => {
  try {
    const data = await scrapesCollection().find({}).toArray();
    return res.json(data);
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

// Returns one scrape document (JSON `null` when the id matches nothing).
app.get("/scrapes/:id", async (req, res) => {
  const id = req.params.id;
  try {
    const data = await scrapesCollection().findOne({ _id: new ObjectID(id) });
    return res.json(data);
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

// Registers a new scrape request (status "requested") for the given filters.
app.post("/scrapes/", async (req, res) => {
  const { location, description, price, beds, type, lifestyle, baths } = readFilters(req.body);

  // query builder
  const query = buildQuery(type, location, beds, baths, price, lifestyle);
  console.log(query);

  try {
    // Fetching the search page is inside the try block so an upstream
    // failure produces a 500 response instead of an unhandled rejection.
    const estimate = await estimateSearch(query);
    if (!estimate) {
      console.error("No results");
      return res.status(404).json();
    }

    const inserted = await scrapesCollection().insertOne({
      count: estimate.count,
      pageCount: estimate.pageCount,
      estimate: estimateFinishDate(estimate.count),
      createDate: new Date(),
      sourceUrl: query,
      location: location,
      description: description,
      filters: [
        { name: 'baths', value: baths },
        { name: 'price', value: price },
        { name: 'beds', value: beds },
        { name: 'type', value: type },
        { name: 'lifestyle', value: lifestyle },
      ],
      status: "requested",
    });
    console.log(inserted);
    return res.json();
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

// Returns the size/time estimate for a search without storing anything.
app.post("/scrapes/estimate", async (req, res) => {
  const { location, price, beds, type, lifestyle, baths } = readFilters(req.body);
  const query = buildQuery(type, location, beds, baths, price, lifestyle);
  console.log(query);

  try {
    const estimate = await estimateSearch(query);
    if (!estimate) {
      console.error("No results");
      return res.status(404).json();
    }
    return res.json({
      count: estimate.count,
      pageCount: estimate.pageCount,
      estimate: estimateFinishDate(estimate.count),
    });
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

// Marks a scrape as "pending" and schedules the `scrape` job for it.
app.patch("/scrapes/:id/execute", async (req, res) => {
  const id = req.params.id;
  try {
    const o_id = new ObjectID(id);
    const newvalues = { $set: { status: "pending", startDate: new Date() } };
    const result = await scrapesCollection().updateOne({ _id: o_id }, newvalues);
    if (result.matchedCount === 0) {
      // Nothing to execute: do not schedule a job for a missing document.
      return res.status(404).json();
    }
    // Awaited so a scheduling failure reaches the catch block below.
    await agenda.now('scrape', { _id: o_id });
    return res.status(204).json();
  } catch (err) {
    console.log(err);
    return res.status(500).json();
  }
});

const port = 5501;
app.listen(port, () => {
  console.log(`Example app listening at http://localhost:${port}`)
});

/**
 * Builds the apartments.com search URL for the given filters. Every filter is
 * optional; falsy filters are left out.
 *
 * Resulting shape:
 *   https://www.apartments.com[/type][/location][/beds-baths-price][/lifestyle]
 *
 * @param {string} [type] - Property type path segment.
 * @param {string} [location] - Free-text location such as "Salt Lake City, UT";
 *   lower-cased, with every ", " and every space turned into "-".
 * @param {string|number} [beds] - Bedroom count.
 * @param {string|number} [baths] - Bathroom count.
 * @param {string|number} [price] - Emitted as `over-<price>`.
 * @param {string} [lifestyle] - Lifestyle path segment.
 * @returns {string} The search URL.
 */
function buildQuery(type, location, beds, baths, price, lifestyle) {
  let query = `https://www.apartments.com`;

  if (type) {
    query += `/${type}`;
  }

  if (location) {
    // Global patterns: a string pattern would only replace the first match,
    // leaving spaces in multi-word city names.
    const locationQuery = location.replace(/, /g, "-").replace(/ /g, "-").toLowerCase();
    query += `/${locationQuery}`;
  }

  // Beds, baths and price share one path segment, joined by "-"; whichever
  // comes first opens the segment with "/".
  // NOTE(review): joining price with "-" after a baths-only filter assumes
  // apartments.com treats baths like beds here — confirm.
  if (beds) {
    query += `/${beds}-bedrooms`;
  }
  if (baths) {
    query += `${beds ? '-' : '/'}${baths}-bathrooms`;
  }
  if (price) {
    query += `${beds || baths ? '-' : '/'}over-${price}`;
  }

  if (lifestyle) {
    query += `/${lifestyle}`;
  }

  return query;
}

// Handles graceful stopping of jobs
/**
 * Stops the Agenda processor, closes the MongoDB client if one was opened,
 * and exits with code 0. Errors during shutdown are logged and do not
 * prevent the exit.
 *
 * NOTE(review): assumes `agenda.stop()` returns a promise (promise-based
 * Agenda release) — confirm against the installed version.
 */
async function graceful() {
  try {
    await agenda.stop();
    // `client` is still undefined if the signal arrives before the
    // connection was established.
    await client?.close();
  } catch (e) {
    console.error(e);
  } finally {
    process.exit(0);
  }
}

process.on('SIGTERM', graceful);
process.on('SIGINT', graceful);