2015-08-16 13:18:01 +02:00
|
|
|
//spider.js
|
2015-08-16 13:42:14 +02:00
|
|
|
/**
 * Module/script for getting students, teachers, chambers and groups from the schedule website.
 * This script needs to be run before using the application; otherwise the website will have no information to run on.
 * @module spider
 * @author Bram van der Veen <96aa48@gmail.com>
 */
|
2015-08-16 13:18:01 +02:00
|
|
|
|
|
|
|
//Import first-party modules.
|
2016-06-18 15:06:51 +02:00
|
|
|
const url = require('url');
|
2015-08-16 13:18:01 +02:00
|
|
|
|
|
|
|
//Import third-party modules
|
2016-06-18 15:06:51 +02:00
|
|
|
const http = require('socks5-http-client');
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const iconv = require('iconv-lite');
|
2015-08-16 13:18:01 +02:00
|
|
|
|
|
|
|
//Import self-written modules.
|
2016-06-18 15:06:51 +02:00
|
|
|
const config = require('./configuration');
|
|
|
|
const database = require('./database')();
|
2015-06-06 20:31:35 +02:00
|
|
|
|
2015-08-16 13:18:01 +02:00
|
|
|
//Local values: the school ID from the configuration, used as the `school` query parameter.
const { schoolID } = config();
|
|
|
|
|
2015-06-06 20:31:35 +02:00
|
|
|
//The schedule types requested by crawl(); each value is sent as the `type` query parameter.
//Declared with `const` (like the rest of the file) so the list cannot be rebound by accident.
const scheduletypes = [
  'Klasrooster',
  'Docentrooster',
  'Leerlingrooster',
  'Lokaalrooster'
];
|
|
|
|
|
2015-08-16 13:18:01 +02:00
|
|
|
/**
 * Function for crawling the schedule site for data such as: students, teachers
 * chambers and groups.
 *
 * Drops the existing `index` collection, then requests the overview page of
 * every entry in `scheduletypes` through the SOCKS host/port taken from the
 * configuration (`torHost`/`torPort`). Each completely downloaded page is
 * handed to `rip` as an object of the form `{type, data}`.
 */
function crawl() {
  console.log('Starting to crawl the schedule pages for names, student IDs, chambers and teachers');
  database.collection('index').drop();

  //`const` gives every iteration its own binding, so the asynchronous callbacks
  //below see the right schedule type without an implicit global or a wrapping IIFE.
  for (const scheduletype of scheduletypes) {
    const options = url.parse(`http://roosters5.gepro-osi.nl/roosters/rooster.php?school=${schoolID}&type=${scheduletype}`);
    options.socksPort = config().torPort;
    options.socksHost = config().torHost;

    http.get(options, (res) => {
      //Start from an empty string: appending to an unset property would prefix
      //the page with the literal text "undefined".
      const _download = {
        type: scheduletype,
        data: ''
      };

      //Decode every chunk the same way rip() does for the student pages, so
      //non-ASCII characters are not mangled by the implicit Buffer-to-string conversion.
      res.on('data', (data) => {
        _download.data += iconv.decode(data, 'binary');
      });
      res.on('end', () => rip(_download));
    });
  }
}
|
|
|
|
|
2015-08-16 13:18:01 +02:00
|
|
|
/**
 * Pulls the list with useful information out of a crawled page.
 * (e.g. student names/ids, teacher codes, chamber numbers)
 * @param {String} page - A string containing a downloaded schedule page.
 * @returns {String[]} The text of the page's `select` elements split on
 * newlines, without the first and the last line.
 */
function extract(page) {
  const selectText = cheerio('select', page).text();
  const lines = selectText.split('\n');

  //Keep everything between the first and the last line.
  return lines.slice(1, -1);
}
|
|
|
|
|
2015-08-16 13:18:01 +02:00
|
|
|
/**
 * Function for ripping all possible information from a page.
 *
 * For a `Leerlingrooster` page every entry returned by `extract` is treated as
 * a student category: the page of that category is downloaded through the
 * configured SOCKS host/port and every option of its `select` element is
 * stored as a student in the `index` collection. After the page of the last
 * category has been processed, the database is closed once
 * `config().spiderTimeout` milliseconds have passed.
 *
 * For every other page type the extracted entries are stored directly as
 * `{name, type}` documents.
 *
 * @param {Object} page - The downloaded schedule page.
 * @param {String} page.type - The schedule type of the page (e.g. 'Docentrooster').
 * @param {String} page.data - A string containing the downloaded schedule page.
 */
function rip(page) {
  const list = extract(page.data);
  const collection = database.collection('index');
  //Stored type: the schedule type without 'rooster', lowercased.
  const type = page.type.replace(/rooster/g, '').toLowerCase();

  if (page.type !== 'Leerlingrooster') {
    console.log('\nRipping a', page.type);
    for (const entry of list) {
      const databaseEntry = {
        'name' : entry,
        'type' : type
      };
      process.stdout.write('☐');
      collection.insert(databaseEntry);
    }
    return;
  }

  console.log('\nRipping a studentlist');
  const lastCategory = list[list.length - 1];

  //`const` gives every iteration its own binding, so the asynchronous callbacks
  //see the right category without an implicit global or a wrapping IIFE.
  for (const studentcategory of list) {
    const options = url.parse(`http://roosters5.gepro-osi.nl/roosters/rooster.php?school=${schoolID}&type=${page.type}&afdeling=${studentcategory}`);
    options.socksPort = config().torPort;
    options.socksHost = config().torHost;

    http.get(options, (res) => {
      let _download = '';

      res.on('data', (data) => {
        _download += iconv.decode(data, 'binary');
      });

      res.on('end', () => {
        const listOfStudents = cheerio('select', _download).children();

        //Walk the numeric indices only; `for...in` would also visit the
        //non-element properties of the cheerio collection.
        for (let index = 0; index < listOfStudents.length; index++) {
          const option = cheerio(listOfStudents[index]);
          const databaseEntry = buildStudentEntry(option.text(), option.val(), studentcategory, type);
          if (databaseEntry === null) {
            continue;
          }
          process.stdout.write('☐');
          collection.insert(databaseEntry);
        }

        //Scheduled after the loop so that an empty last category still closes the database.
        if (studentcategory === lastCategory) {
          setTimeout(() => database.close(), config().spiderTimeout);
        }
      });
    });
  }
}

/**
 * Builds the database entry for one student option of a category page.
 * @param {String} text - The option text, expected as '<group> - <name>'.
 * @param {String} id - The option value, used as the student ID.
 * @param {String} studentcategory - The category the option was listed under.
 * @param {String} type - The type to store on the entry.
 * @returns {Object|null} The entry, or null when the text has no ' - ' separator.
 */
function buildStudentEntry(text, id, studentcategory, type) {
  const [group, name] = text.split(' - ');
  if (name === undefined) {
    //Options without a name part cannot be split into first/last name; skip
    //them instead of crashing the whole run.
    console.warn('Skipping an option without a name:', text);
    return null;
  }

  const [firstName, ...otherNames] = name.split(' ');
  return {
    'id' : id,
    'group' : group,
    'username' : id + firstName.toLowerCase(),
    'name' : name,
    'first_name' : firstName,
    'last_name' : otherNames.join(' '),
    'studentcategory' : studentcategory,
    'type' : type
  };
}
|
|
|
|
|
2015-08-16 13:18:01 +02:00
|
|
|
//Public interface of this module: only the crawler entry point is exposed.
module.exports = { crawl };
|
2015-06-08 19:58:56 +02:00
|
|
|
|
2015-08-16 13:18:01 +02:00
|
|
|
//Start crawling right away when the script is invoked from the cli with `test` or `rip`.
if (['test', 'rip'].includes(process.argv[2])) {
  module.exports.crawl(config().schoolID);
}
|