From 6f5be838cf260984c47c2b06082ddd9d10de0cf3 Mon Sep 17 00:00:00 2001 From: Bram van der Veen <96aa48@gmail.com> Date: Sat, 13 Jun 2015 14:38:55 +0200 Subject: [PATCH] Made a seperate function for ripping, to add more clarity --- spider.js | 142 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 78 insertions(+), 64 deletions(-) diff --git a/spider.js b/spider.js index 58db206..136df74 100644 --- a/spider.js +++ b/spider.js @@ -10,82 +10,31 @@ var scheduletypes = [ 'Lokaalrooster' ]; var schoolid; +var database; //Function for getting pages with http requests. -function get(database) { - - var collection = database.collection('index'); - collection.drop(); +function get() { + database.collection('index').drop(); for (scheduletype of scheduletypes) { + (function (scheduletype) { var link = 'http://roosters5.gepro-osi.nl/roosters/rooster.php?school=' + schoolid + '&type=' + scheduletype; - scheduletype = scheduletype.replace(/rooster/g, '').toLowerCase(); - http.get(link, function (res) { - var _download = ''; + + var _download = {}; + _download.type = scheduletype; res.on('data', function (data) { - _download += data; + _download.data += data; }); res.on('end', function () { - var list = extract(_download); - - if (scheduletype == 'leerling') { - for(studentcategory of list) { - (function (studentcategory) { - - http.get('http://' + res.req.socket._host + res.req.path + '&afdeling=' + studentcategory, function (res) { - var _download = ''; - - res.on('data', function (data) { - _download += iconv.decode(data, 'binary'); - }); - - res.on('end', function () { - var list_students = cheerio('select', _download).children(); - - for (student in list_students) { - if (!isNaN(student)) { - var name = cheerio(list_students[student]).text().split(' - ')[1]; - var id = parseInt(cheerio(list_students[student]).val()); - - var database_entry = { - 'id' : id, - 'username' : id + name.split(' ')[0].toLowerCase(), - 'full_name' : name, - 'first_name' : name.split(' ')[0], - 'last_name' : name.split(' ').splice(1).join(' '), - 'studentcategory' : studentcategory, - 'type' : scheduletype - } - - - collection.insert(database_entry, showOutput); - if (studentcategory == list[list.length - 1] && student == list_students.length - 1) { - database.close(); - } - } - } - }); - }); - })(studentcategory); - } - } - else { - for (entry of list) { - var database_entry = { - 'name' : entry, - 'type' : scheduletype - } - collection.insert(database_entry, showOutput); - } - } - + rip(_download); }); + }); })(scheduletype); } @@ -98,15 +47,80 @@ function extract(page) { return array.splice(1, array.length - 2); } +//Function for ripping all of the information +function rip(data) { + var list = extract(data.data); + var collection = database.collection('index'); + + if (data.type == 'Leerlingrooster') { + + for(studentcategory of list) { + + (function (studentcategory) { + + http.get('http://roosters5.gepro-osi.nl/roosters/rooster.php?school=' + schoolid + '&type=' + data.type + '&afdeling=' + studentcategory, function (res) { + var _download = ''; + + res.on('data', function (data) { + _download += iconv.decode(data, 'binary'); + }); + + res.on('end', function () { + var list_students = cheerio('select', _download).children(); + + for (student in list_students) { + + if (!isNaN(student)) { + var name = cheerio(list_students[student]).text().split(' - ')[1]; + var id = parseInt(cheerio(list_students[student]).val()); + data.type = data.type.replace(/rooster/g, '').toLowerCase(); + var database_entry = { + 'id' : id, + 'username' : id + name.split(' ')[0].toLowerCase(), + 'full_name' : name, + 'first_name' : name.split(' ')[0], + 'last_name' : name.split(' ').splice(1).join(' '), + 'studentcategory' : studentcategory, + 'type' : data.type + } + + collection.insert(database_entry, showOutput); + + if (studentcategory == list[list.length - 1] && student == list_students.length - 1) { + database.close(); + } + + } + } + }); + }); + })(studentcategory); + } + } + else { + for (entry of list) { + var database_entry = { + 'name' : entry, + 'type' : data.type + } + + collection.insert(database_entry, showOutput); + } + } +} + //Function being called to access functionality from this module. function crawl(sid) { schoolid = sid; - mongodb.connect('mongodb://wallpiece/roosterio', function (error, database) { + mongodb.connect('mongodb://wallpiece/roosterio', function (error, db) { if (error) console.warn(error); - get(database); - }) + database = db; + + get(); + }); } + //Redundant function for draining native-mongodb-driver output function showOutput(error, message) { if (process.argv[3] == '-v') {