You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

app.js 8.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. const config = require('config');
  2. const axios = require("axios");
  3. const cheerio = require("cheerio");
  4. const express = require('express');
  5. const cors = require('cors');
  6. var stream = require('stream');
  7. var jsonxml = require('json2xml');
  8. var beautify = require('xml-beautifier');
  9. const MongoClient = require('mongodb').MongoClient;
  10. const ObjectID = require('mongodb').ObjectID;
  11. var apartments = require('./apartments.js');
  12. var houses = require('./houses.js');
  /**
   * Predicate for `Array.prototype.filter` that keeps only the first
   * occurrence of each value (compared with strict equality via `indexOf`).
   *
   * @param {*} item - Element currently being tested.
   * @param {number} position - Index of `item` in `all`.
   * @param {Array} all - The array being filtered.
   * @returns {boolean} `true` when `position` is the first index at which `item` occurs.
   */
  const distinct = (item, position, all) => all.indexOf(item) === position;
  // jobs
  // Connection strings are read from the `config` package.
  // `mongoUrl` is deliberately left as `var`: the same name is declared again
  // with `var` in the database setup section further down this file.
  var mongoUrl = config.get("mongo");
  const agendaDb = config.get("agenda");

  // Job scheduler instance; its backing store address comes from the "agenda" config key.
  const { Agenda } = require('agenda');
  const agenda = new Agenda({ db: { address: agendaDb } });
  /**
   * Agenda job "scrape".
   *
   * Loads the scrape document identified by `job.attrs.data._id`, fetches the
   * search result page(s) at `scrape.sourceUrl`, follows every distinct
   * property link found in `#placardContainer .property-link`, parses each
   * property page with `apartments.apartment`, and finally stores the parsed
   * properties on the scrape document with status "done".
   *
   * The handler takes a `done` callback, so it must be called on every path:
   * `done()` on success and `done(err)` on failure, otherwise the job never
   * completes.
   *
   * @param {object} job - Agenda job; `job.attrs.data._id` is the scrape document id.
   * @param {Function} done - Completion callback supplied by Agenda.
   */
  agenda.define('scrape', async function (job, done) {
    const { _id } = job.attrs.data;
    try {
      const dbo = client.db(database);
      const collection = dbo.collection('scrapes');
      const scrape = await collection.findOne({ _id: _id });
      if (!scrape) {
        // Without the document there is no sourceUrl to fetch; fail the job explicitly.
        throw new Error(`scrape ${_id} not found`);
      }

      // Collected across ALL pages (declared outside the page loop so that a
      // later page does not discard the results of an earlier one).
      const properties = [];

      // NOTE(review): only the first result page is scraped. The original code
      // had the `page <= scrape.pageCount` bound commented out in favour of 1.
      for (let page = 1; page <= 1; page++) {
        console.log("scrapping page " + page)
        const filterPage = await axios(scrape.sourceUrl + `/${page}`);
        const $ = cheerio.load(filterPage.data);
        const propertyLinks = $('#placardContainer .property-link').map(function () {
          return $(this).attr('href');
        }).get();
        // De-duplicate while keeping first-seen order.
        const links = [...new Set(propertyLinks)];

        for (const link of links) {
          // Best effort per property: one failing page must not abort the whole scrape.
          try {
            const response = await axios(link);
            const property = apartments.apartment(cheerio.load(response.data));
            property.url = link;
            properties.push(property);
            console.log(`${link} scraped.`)
          } catch (err) {
            console.error(`${link} scrape failed.`, err);
          }
        }
      }

      await collection.updateOne({ _id: _id }, { $set: { status: "done", result: properties, endDate: new Date() } });
      console.log(`${_id} scraped.`);
      return done();
    } catch (err) {
      console.log(err);
      // Report the failure to Agenda so the job is marked failed instead of
      // staying locked/running forever.
      return done(err);
    }
  });
  // Start the Agenda job processor. A start-up failure is logged and ends the
  // process explicitly rather than surfacing as an unhandled promise rejection.
  (async function () {
    try {
      await agenda.start();
    } catch (err) {
      console.error("Agenda failed to start.", err);
      process.exit(1);
    }
  })();
  // express application
  const app = express();

  // Middleware, registered in the same order as before: JSON body parsing, then CORS.
  const jsonBodyParser = express.json();
  const corsMiddleware = cors();
  app.use(jsonBodyParser);
  app.use(corsMiddleware);
  // database setup
  // `mongoUrl` is also declared with `var` in the jobs section above, so it must
  // stay a `var` here (a `const`/`let` re-declaration would be a SyntaxError).
  var mongoUrl = config.get("mongo");
  const database = config.get("database");

  // Shared MongoDB client; stays `undefined` until the connection below succeeds.
  // Route handlers and the scrape job read it lazily at call time.
  let client = undefined;

  MongoClient.connect(mongoUrl, function (err, db) {
    if (err) throw err;
    console.log("Database created!");
    // NOTE(review): this prints the full connection string, which may contain
    // credentials - consider removing or redacting.
    console.log(mongoUrl);
    client = db;
    const dbo = db.db(database);
    // Make sure the "scrapes" collection exists.
    dbo.createCollection("scrapes", function (err, res) {
      if (err) {
        // Only a "namespace exists" error (MongoDB code 48) means the
        // collection is already there; anything else is a real failure.
        if (err.code === 48 || err.codeName === 'NamespaceExists') {
          console.log("Collection already created!");
        } else {
          console.error("Collection creation failed.", err);
        }
        return;
      }
      console.log("Collection created!");
    });
  });
  // GET / - trivial endpoint that always responds with the JSON string "ok".
  app.get("/", async function (_request, response) {
    return response.json("ok");
  });
  /**
   * GET /scrapes/:id/files/xml
   *
   * Sends the scrape document with the given id as a downloadable,
   * pretty-printed XML file.
   *
   * Responses: 400 when `:id` is not a valid ObjectID, 404 when no such
   * document exists, 500 on any other error, otherwise the XML attachment.
   */
  app.get("/scrapes/:id/files/xml", async (req, res) => {
    const id = req.params.id;
    if (!ObjectID.isValid(id)) {
      return res.status(400).json();
    }
    try {
      const dbo = client.db(database);
      const collection = dbo.collection('scrapes');
      const o_id = new ObjectID(id);
      const data = await collection.findOne({ _id: o_id });
      if (!data) {
        return res.status(404).json();
      }

      // Stringify the id and date fields before handing the document to json2xml.
      const input = {
        ...data,
        _id: data._id.toString(),
        estimate: data.estimate?.toString(),
        // Was previously (wrongly) filled from `data.estimate`.
        createDate: data.createDate?.toString()
      };
      const xml = jsonxml(input, { header: true });
      const formattedXml = beautify(xml);

      // Stream the generated file to the client as an attachment.
      const fileContents = Buffer.from(formattedXml);
      const readStream = new stream.PassThrough();
      readStream.end(fileContents);
      // Use the normalised hex id in the header rather than raw user input.
      res.set('Content-disposition', 'attachment; filename=' + o_id.toHexString() + '.xml');
      res.set('Content-Type', 'text/plain');
      readStream.pipe(res);
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
  });
  // GET /scrapes - responds with every document of the "scrapes" collection as
  // a JSON array, or with an empty 500 response when anything fails.
  app.get("/scrapes", async function (_request, response) {
    try {
      const allScrapes = await client
        .db(database)
        .collection('scrapes')
        .find({})
        .toArray();
      return response.json(allScrapes);
    } catch (failure) {
      console.log(failure);
      return response.status(500).json();
    }
  });
  /**
   * GET /scrapes/:id
   *
   * Responds with the scrape document that has the given id.
   *
   * Responses: 400 when `:id` is not a valid ObjectID, 404 when no such
   * document exists, 500 on any other error, otherwise the document as JSON.
   */
  app.get("/scrapes/:id", async (req, res) => {
    const id = req.params.id;
    if (!ObjectID.isValid(id)) {
      return res.status(400).json();
    }
    try {
      const dbo = client.db(database);
      const collection = dbo.collection('scrapes');
      const o_id = new ObjectID(id);
      const data = await collection.findOne({ _id: o_id });
      if (!data) {
        return res.status(404).json();
      }
      return res.json(data);
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
  });
  // Number of listings assumed per result page when estimating from the page count.
  const RESULTS_PER_PAGE = 25;

  /**
   * Extracts the total page count from the text of the `.pageRange` element,
   * i.e. the number following the last "of " in the text.
   *
   * @param {string} pageRangeText - Text content of the page-range element.
   * @returns {number} The page count, or 0 when it cannot be determined.
   */
  function parsePageCount(pageRangeText) {
    const marker = pageRangeText.lastIndexOf("of ");
    if (marker === -1) {
      return 0;
    }
    const parsed = Number.parseInt(pageRangeText.slice(marker + 3), 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  }

  /**
   * Fetches the search page at `query` and estimates the size of the scrape.
   *
   * When a page range is present the result count is `pages * 25`; otherwise
   * the property links on the single page are counted. `estimate` is "now"
   * plus one second per expected result.
   *
   * @param {string} query - Search URL produced by `buildQuery`.
   * @returns {Promise<{count: number, pageCount: number, estimate: Date} | null>}
   *   The estimate, or `null` when the page lists no results.
   */
  async function estimateSearch(query) {
    const filterPage = await axios(query);
    const $ = cheerio.load(filterPage.data);

    const pagesCount = parsePageCount($(".pageRange").text());
    let resultCount;
    if (pagesCount > 0) {
      resultCount = pagesCount * RESULTS_PER_PAGE;
    } else {
      // Single page of results (no usable page range): count the links directly.
      const propertyLinks = $('#placardContainer .property-link').map(function () {
        return $(this).attr('href');
      }).get();
      if (!propertyLinks.length) {
        return null;
      }
      resultCount = propertyLinks.length;
    }

    const dt = new Date();
    dt.setSeconds(dt.getSeconds() + resultCount);
    return { count: resultCount, pageCount: pagesCount, estimate: dt };
  }

  /**
   * POST /scrapes/
   *
   * Builds the search URL from the request body filters, estimates the result
   * size and stores a new scrape document with status "requested".
   *
   * Responses: 404 when the search has no results, 500 on any failure
   * (including failure to fetch the search page), otherwise an empty 200.
   */
  app.post("/scrapes/", async (req, res) => {
    const { location, description, price, beds, type, lifestyle, baths } = req.body ?? {};

    // query builder
    const query = buildQuery(type, location, beds, baths, price, lifestyle);
    console.log(query);

    try {
      const estimation = await estimateSearch(query);
      if (!estimation) {
        console.error("No results");
        return res.status(404).json();
      }

      const dbo = client.db(database);
      const collection = dbo.collection('scrapes');
      const insertResult = await collection.insertOne({
        count: estimation.count,
        pageCount: estimation.pageCount,
        estimate: estimation.estimate,
        createDate: new Date(),
        sourceUrl: query,
        location: location,
        description: description,
        filters: [
          { name: 'baths', value: baths },
          { name: 'price', value: price },
          { name: 'beds', value: beds },
          { name: 'type', value: type },
          { name: 'lifestyle', value: lifestyle },
        ],
        status: "requested"
      });
      console.log(insertResult);
      return res.json();
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
  });

  /**
   * POST /scrapes/estimate
   *
   * Same estimation as POST /scrapes/ but nothing is stored; responds with
   * `{ count, pageCount, estimate }`.
   *
   * Responses: 404 when the search has no results, 500 on any failure
   * (including failure to fetch the search page).
   */
  app.post("/scrapes/estimate", async (req, res) => {
    const { location, price, beds, type, lifestyle, baths } = req.body ?? {};

    const query = buildQuery(type, location, beds, baths, price, lifestyle);
    console.log(query);

    try {
      const estimation = await estimateSearch(query);
      if (!estimation) {
        console.error("No results");
        return res.status(404).json();
      }
      return res.json({
        count: estimation.count,
        pageCount: estimation.pageCount,
        estimate: estimation.estimate,
      });
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
  });
  /**
   * PATCH /scrapes/:id/execute
   *
   * Marks the scrape document as "pending" (with a start date) and queues the
   * "scrape" job for it.
   *
   * Responses: 400 when `:id` is not a valid ObjectID, 404 when no document
   * matches (no job is queued in that case), 500 on any other error,
   * otherwise 204.
   */
  app.patch("/scrapes/:id/execute", async (req, res) => {
    const id = req.params.id;
    if (!ObjectID.isValid(id)) {
      return res.status(400).json();
    }
    try {
      const dbo = client.db(database);
      const collection = dbo.collection('scrapes');
      const o_id = new ObjectID(id);
      const newvalues = { $set: { status: "pending", startDate: new Date() } };
      const updateResult = await collection.updateOne({ _id: o_id }, newvalues);
      if (updateResult.matchedCount === 0) {
        return res.status(404).json();
      }
      // Await scheduling so that a failure to queue the job reaches the catch
      // below instead of becoming an unhandled rejection.
      await agenda.now('scrape', { _id: o_id });
      return res.status(204).end();
    } catch (err) {
      console.log(err);
      return res.status(500).json();
    }
  });
  // TCP port the HTTP server listens on.
  const port = 5501;

  // Logs the local URL once the server is accepting connections.
  function announceListening() {
    console.log(`Example app listening at http://localhost:${port}`)
  }

  app.listen(port, announceListening);
  /**
   * Builds an apartments.com search URL from the given filters. Every filter
   * is optional; falsy filters are skipped.
   *
   * Path layout: `/{type}/{location}/{beds}-bedrooms-{baths}-bathrooms-over-{price}/{lifestyle}`.
   * Bedrooms, bathrooms and price share one path segment, joined by "-"; the
   * first of them that is present starts the segment with "/".
   *
   * @param {string} [type] - Listing type segment.
   * @param {string} [location] - Free text such as "Salt Lake City, UT"; every
   *   ", " and every space becomes "-" and the result is lower-cased.
   * @param {number|string} [beds] - Number of bedrooms.
   * @param {number|string} [baths] - Number of bathrooms.
   * @param {number|string} [price] - Value placed after "over-".
   * @param {string} [lifestyle] - Lifestyle segment.
   * @returns {string} The search URL.
   */
  function buildQuery(type, location, beds, baths, price, lifestyle) {
    let query = `https://www.apartments.com`;

    if (type) {
      query += `/${type}`;
    }

    if (location) {
      // Global regexes: String.replace with a string pattern only replaces the
      // FIRST match, which broke multi-word cities ("salt-lake city-ut").
      const locationQuery = String(location)
        .replace(/, /g, "-")
        .replace(/ /g, "-")
        .toLowerCase();
      query += `/${locationQuery}`;
    }

    if (beds) {
      query += `/${beds}-bedrooms`;
    }

    if (baths) {
      query += `${beds ? '-' : '/'}${baths}-bathrooms`;
    }

    if (price) {
      // Continue the beds/baths segment when either is present (previously only
      // beds was checked, so baths-only searches got a separate "/over-" segment).
      query += `${beds || baths ? '-' : '/'}over-${price}`;
    }

    if (lifestyle) {
      query += `/${lifestyle}`;
    }

    return query;
  }
  // Handles graceful stopping of jobs
  /**
   * Signal handler: stops the Agenda job processor, closes the MongoDB client
   * (if a connection was ever established) and then exits the process with
   * code 0. Errors during shutdown are logged and do not prevent the exit.
   *
   * Uses the promise returned by `agenda.stop()` / `client.close()` rather than
   * callbacks, so the exit no longer depends on a callback being invoked.
   */
  async function graceful() {
    try {
      await agenda.stop();
      // `client` is still undefined if the signal arrives before MongoDB connected.
      if (client) {
        await client.close();
      }
    } catch (e) {
      console.error(e);
    } finally {
      process.exit(0);
    }
  }
  process.on('SIGTERM', graceful);
  process.on('SIGINT', graceful);