Java tutorial
/* * substitution-schedule-parser - Java library for parsing schools' substitution schedules * Copyright (c) 2016 Johan v. Forstner * * This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. * If a copy of the MPL was not distributed with this file, You can obtain one at https://mozilla.org/MPL/2.0/. */ package me.vertretungsplan.parser; import me.vertretungsplan.exception.CredentialInvalidException; import me.vertretungsplan.objects.Substitution; import me.vertretungsplan.objects.SubstitutionSchedule; import me.vertretungsplan.objects.SubstitutionScheduleData; import me.vertretungsplan.objects.SubstitutionScheduleDay; import org.apache.http.NameValuePair; import org.apache.http.message.BasicNameValuePair; import org.jetbrains.annotations.NotNull; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.parser.Tag; import org.jsoup.select.Elements; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * Parser for substitution schedules in HTML format created by the <a href="http://www.haneke.de/SvPlan.html">svPlan</a> * software. * <p> * Example: <a href="http://www.ratsschule.de/Vplan/PH_heute.htm">Ratsschule Melle</a> * <p> * This parser can be accessed using <code>"svplan"</code> for {@link SubstitutionScheduleData#setApi(String)}. * * <h4>Configuration parameters</h4> * These parameters can be supplied in {@link SubstitutionScheduleData#setData(JSONObject)} to configure the parser: * * <dl> * <dt><code>urls</code> (Array of Strings, required)</dt> * <dd>The URLs of the HTML files of the schedule. There is one file for each day.</dd> * * <dt><code>encoding</code> (String, required)</dt> * <dd>The charset of the HTML files. It's probably either UTF-8 or ISO-8859-1.</dd> * * <dt><code>classes</code> (Array of Strings, required)</dt> * <dd>The list of all classes, as they can appear in the schedule</dd> * * <dt><code>classSeparator</code> (String, optional, Default: <code>", "</code>)</dt> * <dd>The string with which multiple classes are separated.</dd> * * <dt><code>excludeTeachers</code> (Boolean, optional, Default: <code>false</code>)</dt> * <dd>Don't show teachers on the schedule.</dd> * </dl> * * Additionally, this parser supports the parameters specified in {@link LoginHandler} for login-protected schedules. */ public class SVPlanParser extends BaseParser { private static final String PARAM_URLS = "urls"; private static final String PARAM_ENCODING = "encoding"; private static final String PARAM_CLASS_SEPARATOR = "classSeparator"; private static final String PARAM_EXCLUDE_TEACHERS = "excludeTeachers"; private JSONObject data; public SVPlanParser(SubstitutionScheduleData scheduleData, CookieProvider cookieProvider) { super(scheduleData, cookieProvider); data = scheduleData.getData(); } public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); // JSONArray urls = data.getJSONArray(PARAM_URLS); String encoding = data.optString(PARAM_ENCODING, null); List<Document> docs = new ArrayList<>(); for (int i = 0; i < urls.length(); i++) { String url; if (urls.get(i) instanceof JSONObject) { // backwards compatibility final JSONObject obj = urls.getJSONObject(i); url = obj.getString("url"); if (obj.has("postData")) { JSONObject postParams = obj.getJSONObject("postData"); List<NameValuePair> nvps = new ArrayList<>(); for (String name : JSONObject.getNames(postParams)) { String value = postParams.getString(name); nvps.add(new BasicNameValuePair(name, value)); } docs.add(Jsoup.parse(httpPost(url, encoding, nvps).replace(" ", ""))); } else { docs.add(Jsoup.parse(httpGet(url, encoding).replace(" ", ""))); } } else { url = urls.getString(i); docs.add(Jsoup.parse(httpGet(url, encoding).replace(" ", ""))); } } SubstitutionSchedule v = parseSVPlanSchedule(docs); return v; } @NotNull SubstitutionSchedule parseSVPlanSchedule(List<Document> docs) throws IOException, JSONException { SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); for (Document doc : docs) { if (doc.select(".svp").size() > 0) { for (Element svp : doc.select(".svp")) { parseSvPlanDay(v, svp, doc); } } else if (doc.select(".Trennlinie").size() > 0) { Element div = new Element(Tag.valueOf("div"), ""); for (Node node : doc.body().childNodesCopy()) { if (node instanceof Element && ((Element) node).hasClass("Trennlinie") && div.select("table").size() > 0) { parseSvPlanDay(v, div, doc); div = new Element(Tag.valueOf("div"), ""); } else { div.appendChild(node); } } parseSvPlanDay(v, div, doc); } else { parseSvPlanDay(v, doc, doc); } } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; } private void parseSvPlanDay(SubstitutionSchedule v, Element svp, Document doc) throws IOException { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); if ((svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0 || doc.title().startsWith("Vertretungsplan fr "))) { setDate(svp, doc, day); if (svp.select(".svp-tabelle, table:has(.Klasse)").size() > 0) { Elements rows = svp.select(".svp-tabelle tr, table:has(.Klasse) tr"); String lastLesson = ""; String lastClass = ""; for (Element row : rows) { if ((doc.select(".svp-header").size() > 0 && row.hasClass("svp-header")) || row.select("th").size() > 0 || row.text().trim().equals("")) { continue; } Substitution substitution = new Substitution(); for (Element column : row.select("td")) { String type = column.className(); if (!hasData(column.text())) { if ((type.startsWith("svp-stunde") || type.startsWith("Stunde")) && hasData(lastLesson)) { substitution.setLesson(lastLesson); } else if ((type.startsWith("svp-klasse") || type.startsWith("Klasse")) && hasData(lastClass)) { substitution.getClasses().addAll(Arrays .asList(lastClass.split(data.optString(PARAM_CLASS_SEPARATOR, ", ")))); } continue; } if (type.startsWith("svp-stunde") || type.startsWith("Stunde")) { substitution.setLesson(column.text()); lastLesson = column.text(); } else if (type.startsWith("svp-klasse") || type.startsWith("Klasse")) { substitution.getClasses().addAll(Arrays .asList(column.text().split(data.optString(PARAM_CLASS_SEPARATOR, ", ")))); lastClass = column.text(); } else if (type.startsWith("svp-esfehlt") || type.startsWith("Lehrer")) { if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) { substitution.setPreviousTeacher(column.text()); } } else if (type.startsWith("svp-esvertritt") || type.startsWith("Vertretung")) { if (!data.optBoolean(PARAM_EXCLUDE_TEACHERS)) { substitution.setTeacher(column.text().replaceAll(" \\+$", "")); } } else if (type.startsWith("svp-fach") || type.startsWith("Fach")) { substitution.setSubject(column.text()); } else if (type.startsWith("svp-bemerkung") || type.startsWith("Anmerkung")) { substitution.setDesc(column.text()); String recognizedType = recognizeType(column.text()); substitution.setType(recognizedType); substitution.setColor(colorProvider.getColor(recognizedType)); } else if (type.startsWith("svp-raum") || type.startsWith("Raum")) { substitution.setRoom(column.text()); } } if (substitution.getType() == null) { substitution.setType("Vertretung"); substitution.setColor(colorProvider.getColor("Vertretung")); } day.addSubstitution(substitution); } } if (svp.select(".LehrerVerplant").size() > 0) { day.addMessage("<b>Verplante Lehrer:</b> " + svp.select(".LehrerVerplant").text()); } if (svp.select(".Abwesenheiten").size() > 0) { day.addMessage("<b>Abwesenheiten:</b> " + svp.select(".Abwesenheiten").text()); } if (svp.select("h2:contains(Mitteilungen)").size() > 0) { Element h2 = svp.select("h2:contains(Mitteilungen)").first(); Element sibling = h2.nextElementSibling(); while (sibling != null && sibling.tagName().equals("p")) { for (String nachricht : TextNode.createFromEncoded(sibling.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) day.addMessage(nachricht); } sibling = sibling.nextElementSibling(); } } else if (svp.select(".Mitteilungen").size() > 0) { for (Element p : svp.select(".Mitteilungen")) { for (String nachricht : TextNode.createFromEncoded(p.html(), null).getWholeText() .split("<br />\\s*<br />")) { if (hasData(nachricht)) day.addMessage(nachricht); } } } v.addDay(day); } else { throw new IOException("keine SVPlan-Tabelle gefunden"); } } private void setDate(Element svp, Document doc, SubstitutionScheduleDay day) { String date = "Unbekanntes Datum"; if (svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0) { date = svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").text() .replaceAll("Vertretungsplan (fr )?", "").trim(); } else if (doc.title().startsWith("Vertretungsplan fr ")) { date = doc.title().substring("Vertretungsplan fr ".length()); } date = date.replaceAll("\\s+", " "); day.setDateString(date); day.setDate(ParserUtils.parseDate(date)); if (svp.select(".svp-uploaddatum, .Stand").size() > 0) { String lastChange = svp.select(".svp-uploaddatum, .Stand").text().replace("Aktualisierung: ", "") .replace("Stand: ", ""); day.setLastChangeString(lastChange); day.setLastChange(ParserUtils.parseDateTime(lastChange)); } } private void loadUrl(String url, String encoding, List<Document> docs) throws IOException, CredentialInvalidException { } public List<String> getAllClasses() throws JSONException { return getClassesFromJson(); } @Override public List<String> getAllTeachers() { return null; } private boolean hasData(String text) { return !text.trim().equals("") && !text.trim().equals("---"); } }