Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
MeltedHugo committed Nov 13, 2018
1 parent 203b44b commit 270be10
Show file tree
Hide file tree
Showing 5 changed files with 800 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# JSONs
json

# Logs
logs
*.log
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
# hab-schedule-parser
Parses the current lecture schedule of the University of Applied Sciences Aschaffenburg into JSON files.

## Requirements
* Node.js with npm
* An internet connection
* Permission to create folders in the current directory

## How to use
1. Clone this repository.
2. Run `npm install` to install the dependencies.
3. Run `npm start` or `node main.js` to start the program.
263 changes: 263 additions & 0 deletions main.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
const request = require('request'); // To get website data
const fs = require('fs'); // To output files
var cheerio = require('cheerio'); // To work with html data
var cheerioTableparser = require('cheerio-tableparser'); // The most useful tool to parse those pesky tables.
var he = require('he'); // To decode HTML entities to utf-8 strings

// Make sure the output directory for the generated JSON files exists
// before any of the parsers try to write into it.
const jsonDirExists = fs.existsSync('./json');
if (jsonDirExists === false) {
    console.log("Creating directory: ./json")
    fs.mkdirSync('./json');
}

// Entry point: fetch the list of classes and parse the timetable of each one.
getEverything();

// Returns the last item of an array without modifying it
function returnLastItem(array){
    return array[array.length-1];
}

/**
 * Determines the year a lecture date belongs to.
 *
 * Years are not provided in the table. The date is first assumed to lie in the
 * current year; if that puts it too far in the past, it must be in the future
 * (i.e. next year).
 *
 * @param {Date} parsedDate - The lecture date, built with the current year.
 * @param {Date} [now] - Reference point for "today"; defaults to the current time.
 * @returns {number} The current year or the year after it.
 */
function fixYear(parsedDate, now){
    const currentDate = now === undefined ? new Date() : now;
    const currentYear = currentDate.getFullYear();
    // 12960000000 ms = 150 days
    const difference = currentDate - parsedDate;
    return difference > 12960000000 ? currentYear + 1 : currentYear;
}

/**
 * Parses the date and start time of a lecture from the strings found in the table.
 *
 * The date is built from all components in one step. Starting from today's date
 * and calling setMonth() before setDate() would roll over into the following
 * month whenever today's day of month does not exist in the target month
 * (e.g. running the script on the 31st for a lecture in a 30-day month).
 *
 * @param {string} tag - Header of the day column. The parsing expects
 *                       "<weekday>, DD.MM." (weekday name, comma, space, day.month).
 * @param {string} uhrzeit - Time span of the lecture. The parsing expects
 *                           "HH:MM - HH:MM"; only the start time is used.
 * @param {Date} [now] - Reference point for "today"; defaults to the current time.
 * @returns {Date} Local start time of the lecture. An Invalid Date (serialised as
 *                 null by JSON.stringify) if the numbers cannot be parsed.
 * @throws {TypeError} If tag does not contain ", ".
 */
function parseDate(tag, uhrzeit, now){
    const today = now === undefined ? new Date() : now;

    // tag.split(", ")[0] is just the name of the day, which is not needed.
    const dateParts = tag.split(", ")[1].split(".");
    const day = parseInt(dateParts[0], 10);
    const month = parseInt(dateParts[1], 10) - 1; // Date months are 0-indexed

    const startParts = uhrzeit.split(" - ")[0].split(":");
    const hours = parseInt(startParts[0], 10);
    const minutes = parseInt(startParts[1], 10);

    const inCurrentYear = new Date(today.getFullYear(), month, day, hours, minutes, 0, 0);
    const year = fixYear(inCurrentYear, today);

    // Build the date again with the final year instead of calling setFullYear(),
    // so that a day such as 29.02. is resolved against the correct year.
    return new Date(year, month, day, hours, minutes, 0, 0);
}

/*********************************
 * THE MAIN PARSING HAPPENS HERE *
 ********************************/

/**
 * Converts the table of one week into a flat list of lecture objects.
 *
 * @param {*} week - The <table> element of the week, as handed out by cheerio.
 * @returns {Array<Object>} One object per lecture with the keys vorlesung, datum,
 *                          zeit, dozent, raum, gruppe and timestamp.
 * @throws {TypeError} If a cell does not have the expected "<br>"-separated layout.
 */
function parseWeek(week) {
    const $w = cheerio.load(week);
    cheerioTableparser($w);
    // Read more about the cheerio tableparser here:
    // https://www.npmjs.com/package/cheerio-tableparser
    const data = $w("table").parsetable(true, false, false);

    // The first three columns are skipped (presumably layout/time columns -
    // TODO confirm against the page markup).
    const columns = [];
    for (let i = 3; i < data.length; i++) {
        // Get rid of empty or meaningless cells (There are a lot of them)
        const col = data[i].filter(function (el) {
            return el !== "&#xA0;" && el !== "";
        });

        // A column that only consists of its header holds no lectures
        if (col.length > 1) {
            columns.push(col);
        }
    }

    // Remove time column
    columns.splice(-1, 1);

    // Sort "double lectures" (that happen on the same day simultaneously) into the
    // correct day array: neighbouring columns with the same header are one day.
    const days = [];
    columns.forEach(function (col) {
        const previous = days[days.length - 1];
        if (previous !== undefined && previous[0] === col[0]) {
            for (let j = 1; j < col.length; j++) {
                previous.push(col[j]);
            }
        } else {
            days.push(col);
        }
    });

    // Each day is now [header, lecture, lecture, ...].
    // Convert every lecture cell into a well-readable object.
    const vorlesungArray = [];

    days.forEach(function (tag) {
        const datum = tag[0];

        tag.slice(1).forEach(function (vorlesung) {
            const parts = vorlesung.split("<br>");
            // Some courses don't provide much information. With fewer than four
            // parts there is no lecturer and no room.
            const hasDetails = parts.length >= 4;
            const uhrzeit = returnLastItem(parts[0].split(">"));

            vorlesungArray.push({
                vorlesung: he.decode(parts[1]),
                datum: datum,
                zeit: uhrzeit,
                dozent: hasDetails ? he.decode(parts[2]) : "",
                raum: hasDetails ? he.decode(parts[3].split("<")[0]) : "",
                gruppe: he.decode(returnLastItem(parts).split("</a>")[0].split("<")[0]),
                timestamp: parseDate(datum, uhrzeit)
            });
        });
    });

    return vorlesungArray;
}

/**
 * Downloads the timetable page of one class, parses every weekly table on it and
 * hands the lectures to makePlainLists(), which writes them to json/<kategorie>.json.
 *
 * The work happens asynchronously in the request callback; the function itself
 * returns nothing. A page that cannot be fetched or parsed is logged and skipped,
 * so one broken page does not abort the remaining classes.
 *
 * @param {string} kategorie - Short name of the class, i.e. the file name of its
 *                             timetable page without ".html".
 */
function getPlan(kategorie){

    // Open timetable website of class
    request('https://www.h-ab.de/fileadmin/dokumente/stundenplan/klassisch/'+kategorie+'.html', function (error, response, body) {

        // If website is offline
        if(error){
            console.log("Error fetching website "+kategorie);
            return;
        }

        // One entry (an array of lectures) per week
        const vl = [];

        try {
            const $ = cheerio.load(body);

            // Remove footnotes
            $( ".fn" ).remove();
            $( ".fnft" ).remove();
            $('span').remove();

            // For each week:
            $('table').each(function (index, table) {
                vl.push(parseWeek(table));
            });
        } catch (parseError) {
            // An exception escaping this callback would terminate the whole process.
            console.log("Error parsing timetable "+kategorie+": "+parseError.message);
            return;
        }

        // Done
        makePlainLists(kategorie, vl);
    });
}


/**
 * Checks which courses exist: fetches the index page listing all currently running
 * classes, writes their short names to json/CLASSLIST.json and starts getPlan()
 * for each of them.
 *
 * The work happens asynchronously in the request callback; the function itself
 * returns nothing.
 */
function getEverything(){
    // This site has a handy list of all currently running classes:
    request('https://www.h-ab.de/fileadmin/dokumente/stundenplan/klassisch/index.html', function (error, response, body) {

        // Without the index there is nothing to parse (same handling as in getPlan)
        if(error){
            console.log("Error fetching website index");
            return;
        }

        const $ = cheerio.load(body);

        const list = [];

        // Collect the short names of the classes (the significant part of the respective URLs)
        $('a').each(function(index, anchor){
            const href = $(anchor).attr("href");

            // Anchors without a href attribute cannot point to a timetable
            if (typeof href !== "string") {
                return;
            }

            const link = href.split(".html")[0];

            // This gets parsed too, but since it's not a class, we don't want this.
            if (link === "http://www.sked.de") {
                return;
            }

            list.push(link);
        });

        // Output the list to a file.
        fs.writeFileSync("json/CLASSLIST.json", JSON.stringify(list));

        // Now do some work
        list.forEach(function(kat){
            getPlan(kat);
        });
    });
}

/**
 * The last step: flattens the per-week lecture lists of one class into a single
 * list, orders it chronologically and writes it to json/<kategorie>.json.
 * Nothing is written when there are no lectures at all.
 *
 * @param {string} kategorie - Short name of the class; used as the file name.
 * @param {Array<Array<Object>>} all - One array of lecture objects per week. Each
 *                                     lecture is sorted by its `timestamp` property.
 */
function makePlainLists(kategorie, all){
    // Get rid of the weeks: append every lecture of every week to one list
    const lectures = all.reduce((collected, woche) => {
        woche.forEach((entry) => {
            collected.push(entry);
        });
        return collected;
    }, []);

    // Sort the whole thing by date
    lectures.sort((first, second) => new Date(first.timestamp) - new Date(second.timestamp));

    // An empty list produces no file
    if (lectures.length <= 0) {
        return;
    }

    // Output to file
    const fileName = kategorie+".json";
    fs.writeFileSync("json/"+fileName, JSON.stringify(lectures));
    console.log("Generated successfully: "+fileName);
}
Loading

0 comments on commit 270be10

Please sign in to comment.