123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333 |
- const config = require('config');
- const axios = require("axios");
- const cheerio = require("cheerio");
- const express = require('express');
- const cors = require('cors');
- var stream = require('stream');
- var jsonxml = require('json2xml');
- var beautify = require('xml-beautifier');
- const MongoClient = require('mongodb').MongoClient;
- const ObjectID = require('mongodb').ObjectID;
-
- var apartments = require('./apartments.js');
- var houses = require('./houses.js');
-
/**
 * `Array.prototype.filter` predicate that keeps only the first occurrence of
 * each value (compared the way `indexOf` compares, i.e. strict equality).
 *
 * @param {*} value - Element currently being tested.
 * @param {number} index - Position of `value` inside `self`.
 * @param {Array} self - The array being filtered.
 * @returns {boolean} `true` when `index` is the first position holding `value`.
 */
const distinct = (value, index, self) => index === self.indexOf(value);
// jobs
const { Agenda } = require('agenda');

// `var` is kept on purpose: `mongoUrl` is declared again further down in this file.
var mongoUrl = config.get("mongo");
var agendaDb = config.get("agenda");

// Job scheduler; its job store address comes from the "agenda" config key.
const agenda = new Agenda({
    db: { address: agendaDb },
});
/*
 * Agenda job "scrape".
 *
 * Reads the scrape document whose `_id` is carried in the job data, fetches the
 * stored search URL page by page, follows every distinct property link found in
 * `#placardContainer`, parses each property page with `apartments.apartment`,
 * and finally stores the collected properties on the scrape document together
 * with `status: "done"` and an `endDate`.
 *
 * A property page that fails to download or parse is logged and skipped; any
 * other failure is logged and handed to `done(err)` so the scheduler is told
 * the run failed instead of being left waiting for a completion signal.
 */
agenda.define('scrape', async function (job, done) {
    const { _id } = job.attrs.data;
    try {
        const collection = client.db(database).collection('scrapes');
        const scrape = await collection.findOne({ _id: _id });
        if (!scrape) {
            throw new Error(`scrape ${_id} not found`);
        }

        // Declared once, outside the page loop, so results from every page accumulate.
        const properties = [];

        // Only the first results page is fetched for now; `scrape.pageCount`
        // holds the number of pages reported when the scrape was requested.
        const lastPage = 1;
        for (let page = 1; page <= lastPage; page++) {
            console.log("scrapping page " + page);

            const filterPage = await axios(scrape.sourceUrl + `/${page}`);
            const $ = cheerio.load(filterPage.data);

            const propertyLinks = $('#placardContainer .property-link').map(function () {
                return $(this).attr('href');
            }).get();

            for (const link of propertyLinks.filter(distinct)) {
                try {
                    const response = await axios(link);

                    const property = apartments.apartment(cheerio.load(response.data));
                    property.url = link;
                    properties.push(property);
                    console.log(`${link} scraped.`);
                } catch (err) {
                    // Best effort: one broken listing must not abort the whole scrape.
                    console.error(`${link} scrape failed.`, err.message);
                }
            }
        }

        await collection.updateOne(
            { _id: _id },
            { $set: { status: "done", result: properties, endDate: new Date() } }
        );
        console.log(`${_id} scraped.`);
    } catch (err) {
        console.log(err);
        return done(err);
    }
    return done();
});
// Kick off the Agenda job processor as soon as the module is loaded.
(async () => {
    await agenda.start();
})();
-
// express application
const app = express();

// Global middleware, registered in this order: JSON body parsing, then CORS.
app.use(express.json(), cors());
-
// database setup
var mongoUrl = config.get("mongo");
var database = config.get("database");

// Shared MongoDB client; it stays undefined until the connect callback below has run.
var client = undefined;
MongoClient.connect(mongoUrl, (connectError, connection) => {
    if (connectError) {
        throw connectError;
    }
    console.log("Database created!");
    console.log(mongoUrl);
    client = connection;

    // Make sure the "scrapes" collection exists; any error is reported as "already created".
    connection.db(database).createCollection("scrapes", (createError) => {
        if (!createError) {
            console.log("Collection created!");
            return;
        }
        console.log("Collection already created!");
    });
});
-
// Health check: always answers with the JSON string "ok".
app.get("/", async (req, res) => res.json("ok"));
-
/*
 * GET /scrapes/:id/files/xml
 *
 * Sends the scrape document with the given id as a downloadable XML file.
 * Responds 404 when no such document exists and 500 on any other failure
 * (including an id that cannot be turned into an ObjectID).
 */
app.get("/scrapes/:id/files/xml", async (req, res) => {
    const id = req.params.id;
    try {
        const collection = client.db(database).collection('scrapes');
        const data = await collection.findOne({ _id: new ObjectID(id) });
        if (!data) {
            return res.status(404).json();
        }

        // Flatten the id and the two dates to strings before the XML conversion.
        const input = {
            ...data,
            _id: data._id.toString(),
            estimate: data.estimate?.toString(),
            createDate: data.createDate?.toString(),
        };

        const formattedXml = beautify(jsonxml(input, { header: true }));
        const fileContents = Buffer.from(formattedXml);

        // The filename uses the stored id rather than the raw URL parameter so
        // no request-controlled text ends up in a response header.
        res.set('Content-disposition', 'attachment; filename=' + input._id + '.xml');
        res.set('Content-Type', 'text/plain');

        const readStream = new stream.PassThrough();
        readStream.end(fileContents);
        readStream.pipe(res);
    } catch (err) {
        console.log(err);
        res.status(500).json();
    }
});
-
// GET /scrapes: returns every document of the "scrapes" collection as a JSON array.
app.get("/scrapes", async (req, res) => {
    try {
        const scrapes = await client
            .db(database)
            .collection('scrapes')
            .find({})
            .toArray();
        return res.json(scrapes);
    } catch (err) {
        console.log(err);
        return res.status(500).json();
    }
});
// GET /scrapes/:id: returns the scrape document with the given id as JSON
// (the lookup result is sent as-is); any failure yields an empty 500 response.
app.get("/scrapes/:id", async (req, res) => {
    const { id } = req.params;
    try {
        const scrapes = client.db(database).collection('scrapes');
        const scrape = await scrapes.findOne({ _id: new ObjectID(id) });
        return res.json(scrape);
    } catch (err) {
        console.log(err);
        res.status(500).json();
    }
});
/**
 * Fetches the first results page of a search URL and estimates the size of the
 * full result set.
 *
 * When the page has a `.pageRange` element, the number after its last "of " is
 * taken as the page count and every page is assumed to hold 25 results.
 * Without that element the property links on the page are counted directly
 * and the page count is reported as 0.
 *
 * @param {string} query - Search URL, as produced by `buildQuery`.
 * @returns {Promise<{count: number, pageCount: number, estimate: Date}|null>}
 *   The estimate (one second of scraping time per result), or `null` when the
 *   page lists no properties at all.
 * @throws {Error} When the page cannot be fetched or its page range cannot be
 *   read as a number.
 */
async function estimateSearch(query) {
    const resultsPerPage = 25;

    const filterPage = await axios(query);
    const $ = cheerio.load(filterPage.data);
    const $pageRange = $(".pageRange");

    let pageCount = 0;
    let count = 0;
    if (!$pageRange.length) {
        const propertyLinks = $('#placardContainer .property-link').map(function () {
            return $(this).attr('href');
        }).get();
        if (!propertyLinks.length) {
            return null;
        }
        count = propertyLinks.length;
    } else {
        const rangeText = $pageRange.text();
        pageCount = Number.parseInt(rangeText.slice(rangeText.lastIndexOf("of ") + 3), 10);
        if (Number.isNaN(pageCount)) {
            throw new Error(`Unexpected page range text: "${rangeText}"`);
        }
        count = pageCount * resultsPerPage;
    }

    return {
        count,
        pageCount,
        estimate: new Date(Date.now() + count * 1000),
    };
}

/*
 * POST /scrapes/
 *
 * Builds the search URL from the posted filters, estimates the result size and
 * stores a new scrape request with status "requested".
 * Responds 404 when the search has no results, 502 when the search page cannot
 * be fetched or understood, 500 when the insert fails, and an empty 200 otherwise.
 */
app.post("/scrapes/", async (req, res) => {
    const { location, description, price, beds, type, lifestyle, baths } = req.body;

    // query builder
    const query = buildQuery(type, location, beds, baths, price, lifestyle);
    console.log(query);

    let estimate;
    try {
        estimate = await estimateSearch(query);
    } catch (err) {
        // Without this the rejection would escape the handler and the request would hang.
        console.log(err);
        return res.status(502).json();
    }
    if (!estimate) {
        console.error("No results");
        return res.status(404).json();
    }

    try {
        const collection = client.db(database).collection('scrapes');
        const inserted = await collection.insertOne({
            count: estimate.count,
            pageCount: estimate.pageCount,
            estimate: estimate.estimate,
            createDate: new Date(),
            sourceUrl: query,
            location: location,
            description: description,
            filters: [
                { name: 'baths', value: baths },
                { name: 'price', value: price },
                { name: 'beds', value: beds },
                { name: 'type', value: type },
                { name: 'lifestyle', value: lifestyle },
            ],
            status: "requested"
        });
        console.log(inserted);
    } catch (err) {
        console.log(err);
        return res.status(500).json();
    }
    return res.json();
});

/*
 * POST /scrapes/estimate
 *
 * Same estimate as POST /scrapes/ but nothing is stored: responds with
 * `{ count, pageCount, estimate }`, 404 when the search has no results, or 502
 * when the search page cannot be fetched or understood.
 */
app.post("/scrapes/estimate", async (req, res) => {
    const { location, price, beds, type, lifestyle, baths } = req.body;

    const query = buildQuery(type, location, beds, baths, price, lifestyle);
    console.log(query);

    let estimate;
    try {
        estimate = await estimateSearch(query);
    } catch (err) {
        console.log(err);
        return res.status(502).json();
    }
    if (!estimate) {
        console.error("No results");
        return res.status(404).json();
    }

    return res.json({
        count: estimate.count,
        pageCount: estimate.pageCount,
        estimate: estimate.estimate,
    });
});
/*
 * PATCH /scrapes/:id/execute
 *
 * Marks the scrape as "pending" (stamping `startDate`) and queues the "scrape"
 * job for it. Responds 204 once the job has been queued, 404 when no scrape has
 * that id, and 500 on any other failure.
 */
app.patch("/scrapes/:id/execute", async (req, res) => {
    const id = req.params.id;

    try {
        const collection = client.db(database).collection('scrapes');

        const o_id = new ObjectID(id);
        const newvalues = { $set: { status: "pending", startDate: new Date() } };
        const result = await collection.updateOne({ _id: o_id }, newvalues);
        if (result.matchedCount === 0) {
            // Nothing to run: do not queue a job for a scrape that does not exist.
            return res.status(404).json();
        }

        // Awaited so a scheduling failure reaches the catch below instead of
        // becoming an unhandled rejection after 204 has already been sent.
        await agenda.now('scrape', { _id: o_id });
        return res.status(204).json();
    } catch (err) {
        console.log(err);
        return res.status(500).json();
    }
});
-
// Start the HTTP server on the fixed port and log the local address once it is listening.
const port = 5501;
app.listen(port, function onListening() {
    console.log(`Example app listening at http://localhost:${port}`);
});
-
/**
 * Builds the apartments.com search URL for the given filters. Every argument
 * is optional; falsy arguments are simply left out of the URL.
 *
 * @param {string} [type] - First path segment, used verbatim.
 * @param {string} [location] - Free-text location such as "New York, NY";
 *   every run of whitespace and/or commas becomes a single "-" and the result
 *   is lower-cased.
 * @param {number|string} [beds] - Rendered as `<beds>-bedrooms`.
 * @param {number|string} [baths] - Rendered as `<baths>-bathrooms`.
 * @param {number|string} [price] - Rendered as `over-<price>`.
 * @param {string} [lifestyle] - Last path segment, used verbatim.
 * @returns {string} The search URL, without a trailing slash.
 */
function buildQuery(type, location, beds, baths, price, lifestyle) {
    let query = `https://www.apartments.com`;
    if (type) {
        query += `/${type}`;
    }
    if (location) {
        // Global pattern: a plain-string replace() would only touch the first
        // separator and leave e.g. "salt-lake city-ut".
        const locationQuery = location.trim().replace(/[\s,]+/g, "-").toLowerCase();
        query += `/${locationQuery}`;
    }

    // Beds, baths and price are joined with "-" into ONE path segment.
    // NOTE(review): the original put price in its own segment when only baths
    // was given; this assumes that was an oversight - confirm against the site.
    const filters = [];
    if (beds) {
        filters.push(`${beds}-bedrooms`);
    }
    if (baths) {
        filters.push(`${baths}-bathrooms`);
    }
    if (price) {
        filters.push(`over-${price}`);
    }
    if (filters.length) {
        query += `/${filters.join("-")}`;
    }

    if (lifestyle) {
        query += `/${lifestyle}`;
    }
    return query;
}
-
/**
 * Handles graceful stopping of jobs: stops the Agenda processor, closes the
 * MongoDB client if one was ever connected, then exits the process with code 0.
 * Failures in either step are logged and do not prevent the exit.
 *
 * @returns {Promise<void>} Never settles in practice because the process exits.
 */
async function graceful() {
    try {
        // Promise-based call: the callback form used before is not invoked by
        // promise-based Agenda, which left the process unable to exit on a signal.
        await agenda.stop();
    } catch (e) {
        console.error(e);
    }
    try {
        // `client` is still undefined if the signal arrives before the database connects.
        await client?.close();
    } catch (e) {
        console.error(e);
    }
    process.exit(0);
}

process.on('SIGTERM', graceful);
process.on('SIGINT', graceful);
|