Usage examples for org.jsoup.parser.Parser.xmlParser()
public static Parser xmlParser()
From source file:Main.java
public static void main(String[] args) throws Exception { String xmlStr = "<style>" + // "v\\:* {behavior:url(#default#VML);}" + // "o\\:* {behavior:url(#default#VML);}" + // "w\\:* {behavior:url(#default#VML);}" + // ".shape {behavior:url(#default#VML);}" + // "</style>" + // "<xml>" + // "<w:WordDocument>" + // "<w:View>Normal</w:View>" + // "<w:Zoom>0</w:Zoom>" + // "<w:TrackMoves>false</w:TrackMoves>" + // "</xml>"; Document doc = Jsoup.parse(xmlStr, "", Parser.xmlParser()); doc.select("style").remove(); System.out.println(doc);/*from ww w .j a v a 2s . co m*/ }
From source file:DataCrawler.OpenAIRE.XMLGenerator.java
public static void main(String[] args) { String text = ""; try {//from w w w. j a va2 s . co m if (args.length < 4) { System.out.println("<command> template_file csv_file output_dir log_file [start_id]"); } // InputStream fis = new FileInputStream("E:/Downloads/result-r-00000"); InputStream fis = new FileInputStream(args[1]); BufferedReader br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8"))); // String content = new String(Files.readAllBytes(Paths.get("publications_template.xml"))); String content = new String(Files.readAllBytes(Paths.get(args[0]))); Document doc = Jsoup.parse(content, "UTF-8", Parser.xmlParser()); // String outputDirectory = "G:/"; String outputDirectory = args[2]; // PrintWriter logWriter = new PrintWriter(new FileOutputStream("publication.log",false)); PrintWriter logWriter = new PrintWriter(new FileOutputStream(args[3], false)); Element objectId = null, title = null, publisher = null, dateofacceptance = null, bestlicense = null, resulttype = null, originalId = null, originalId2 = null; boolean start = true; // String startID = "dedup_wf_001::207a098867b64f3b5af505fa3aeecd24"; String startID = ""; if (args.length >= 5) { start = false; startID = args[4]; } String previousText = ""; while ((text = br.readLine()) != null) { /* For publications: 0. dri:objIdentifier context 9. title context 12. publisher context 18. dateofacceptance 19. bestlicense @classname 21. resulttype @classname 26. 
originalId context (Notice that the prefix is null and will use space to separate two different "originalId") */ if (!previousText.isEmpty()) { text = previousText + text; start = true; previousText = ""; } String[] items = text.split("!"); for (int i = 0; i < items.length; ++i) { items[i] = StringUtils.strip(items[i], "#"); } if (objectId == null) objectId = doc.getElementsByTag("dri:objIdentifier").first(); objectId.text(items[0]); if (!start && items[0].equals(startID)) { start = true; } if (title == null) title = doc.getElementsByTag("title").first(); title.text(items[9]); if (publisher == null) publisher = doc.getElementsByTag("publisher").first(); if (items.length < 12) { previousText = text; continue; } publisher.text(items[12]); if (dateofacceptance == null) dateofacceptance = doc.getElementsByTag("dateofacceptance").first(); dateofacceptance.text(items[18]); if (bestlicense == null) bestlicense = doc.getElementsByTag("bestlicense").first(); bestlicense.attr("classname", items[19]); if (resulttype == null) resulttype = doc.getElementsByTag("resulttype").first(); resulttype.attr("classname", items[21]); if (originalId == null || originalId2 == null) { Elements elements = doc.getElementsByTag("originalId"); String[] context = items[26].split(" "); if (elements.size() > 0) { if (elements.size() >= 1) { originalId = elements.get(0); if (context.length >= 1) { int indexOfnull = context[0].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[0].trim().length() >= (indexOfnull + 5)) value = context[0].trim().substring(indexOfnull + 5); } else { value = context[0].trim(); } originalId.text(value); } } if (elements.size() >= 2) { originalId2 = elements.get(1); if (context.length >= 2) { int indexOfnull = context[1].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[1].trim().length() >= (indexOfnull + 5)) value = context[1].trim().substring(indexOfnull + 5); } else { value = context[1].trim(); } 
originalId2.text(value); } } } } else { String[] context = items[26].split(" "); if (context.length >= 1) { int indexOfnull = context[0].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[0].trim().length() >= (indexOfnull + 5)) value = context[0].trim().substring(indexOfnull + 5); } else { value = context[0].trim(); } originalId.text(value); } if (context.length >= 2) { int indexOfnull = context[1].trim().indexOf("null"); String value = ""; if (indexOfnull != -1) { if (context[1].trim().length() >= (indexOfnull + 5)) value = context[1].trim().substring(indexOfnull + 5); } else { value = context[1].trim(); } originalId2.text(value); } } if (start) { String filePath = outputDirectory + items[0].replace(":", "#") + ".xml"; PrintWriter writer = new PrintWriter(new FileOutputStream(filePath, false)); logWriter.write(filePath + " > Start" + System.lineSeparator()); writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + System.lineSeparator()); writer.write(doc.getElementsByTag("response").first().toString()); writer.close(); logWriter.write(filePath + " > OK" + System.lineSeparator()); logWriter.flush(); } } logWriter.close(); } catch (Exception e) { e.printStackTrace(); } }
From source file:me.vertretungsplan.parser.IndiwareDemoTest.java
/**
 * Parses the {@code xml} fixture with jsoup's XML parser, converts it with
 * {@code parseIndiwareDay(..., false)} and hands the result to {@code verify}.
 */
@Test
public void demoTestXML() throws IOException {
    verify(parser.parseIndiwareDay(
            Jsoup.parse(xml, "", Parser.xmlParser()),
            false));
}
From source file:com.sastix.cms.server.services.content.impl.GeneralFileHandlerServiceImpl.java
@Override public String findParentFile(String xml) { String ret = null;// www . j ava2 s. c om Document doc = Jsoup.parse(xml, "", Parser.xmlParser()); for (Element e : doc.select("resources")) { ret = e.select("resource").get(0).attr("href"); } return ret; }
From source file:com.johan.vertretungsplan.additionalinfo.WinterShParser.java
/**
 * Downloads the feed at {@code URL}, parses it as XML and builds an
 * {@link AdditionalInfo} from the first {@code item description} element.
 *
 * <p>If the description contains the "no indications of weather-related
 * cancellations" sentence, the text is replaced by "keine Informationen" and
 * the info is marked as carrying no information. The title is {@code TITLE}
 * followed by the text of the first {@code pubDate} element.
 *
 * @return the populated info object
 * @throws IOException   declared by the signature (the download uses {@code httpGet})
 * @throws JSONException declared by the signature
 */
@Override
public AdditionalInfo getAdditionalInfo() throws IOException, JSONException {
    AdditionalInfo result = new AdditionalInfo();
    Document feed = Jsoup.parse(httpGet(URL, ENCODING), "", Parser.xmlParser());

    String description = feed.select("item description").first().text();
    boolean nothingReported = description
            .contains("Aktuell gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.");
    if (nothingReported) {
        result.setHasInformation(false);
        result.setText("keine Informationen");
    } else {
        result.setText(description);
    }

    String published = feed.select("pubDate").first().text();
    result.setTitle(TITLE + " (Stand: " + published + ")");
    return result;
}
From source file:me.vertretungsplan.parser.IndiwareDemoTest.java
/**
 * Asserts that the day parsed from the {@code xml} fixture (XML parser,
 * flag {@code false}) equals the day parsed from the {@code html} fixture
 * (default parser, flag {@code true}).
 */
@Test
public void testEquals() throws IOException {
    assertEquals(
            parser.parseIndiwareDay(Jsoup.parse(xml, "", Parser.xmlParser()), false),
            parser.parseIndiwareDay(Jsoup.parse(html), true));
}
From source file:me.vertretungsplan.parser.IndiwareStundenplan24Parser.java
@Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { String baseurl;//from w w w. ja va 2 s.c om if (data.has("schoolNumber")) { baseurl = "http://www.stundenplan24.de/" + data.getString("schoolNumber") + "/vplan/"; if (credential == null || !(credential instanceof UserPasswordCredential)) { throw new IOException("no login"); } String login = ((UserPasswordCredential) credential).getUsername(); String password = ((UserPasswordCredential) credential).getPassword(); executor.auth(login, password); } else { baseurl = data.getString("baseurl") + "/"; new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); } List<Document> docs = new ArrayList<>(); for (int i = 0; i < MAX_DAYS; i++) { LocalDate date = LocalDate.now().plusDays(i); String dateStr = DateTimeFormat.forPattern("yyyyMMdd").print(date); String url = baseurl + "vdaten/VplanKl" + dateStr + ".xml?_=" + System.currentTimeMillis(); try { String xml = httpGet(url, ENCODING); Document doc = Jsoup.parse(xml, url, Parser.xmlParser()); if (doc.select("kopf datei").text().equals("VplanKl" + dateStr + ".xml")) { docs.add(doc); } } catch (HttpResponseException e) { if (e.getStatusCode() != 404 && e.getStatusCode() != 300) throw e; } } SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); for (Document doc : docs) { v.addDay(parseIndiwareDay(doc, false)); } v.setWebsite(baseurl); v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }
From source file:com.sastix.cms.server.services.content.impl.ZipFileHandlerServiceImpl.java
@Override public boolean isScormType(Map<String, byte[]> bytesMap) { byte[] dataXml = bytesMap.get(METADATA_XML_FILE); if (dataXml == null) { return false; } else {//from www . java 2s . c o m String xml = ""; String ret = ""; try { xml = new String(dataXml, "UTF-8"); } catch (UnsupportedEncodingException e) { LOG.error("Error in determining if it is a scorm type: {}", e.getLocalizedMessage()); } Document doc = Jsoup.parse(xml, "", Parser.xmlParser()); for (Element e : doc.select("resources")) { ret = e.select("resource").get(0).attr("adlcp:scormType"); } return ret.equals("sco"); } }
From source file:com.svi.uzabase.logic.ValidationProcess.java
private List<XMLHolder> extractXML() { System.out.println("in extracting xml"); BufferedReader brInput;/*from www . ja v a 2 s. c o m*/ String sCurrentLineInput; String[] fieldNo; String[] splitter; String toValidate; String str; List<XMLHolder> xmlBatchHolder = new ArrayList<>(); XMLHolder xmlFileHolder; Field xmlField; org.jsoup.nodes.Document doc; progress = new AtomicInteger(0); total = new AtomicInteger(xmlHolder.size()); mf.setJprogressValues(total, progress); for (String xmlPath : xmlHolder) { mf.loader("Extracting XML: ", false); xmlFileHolder = new XMLHolder(); xmlFileHolder.setFileName(xmlPath); try { brInput = new BufferedReader(new FileReader(xmlPath)); while ((sCurrentLineInput = brInput.readLine()) != null) { str = sCurrentLineInput; if (str.contains("field no=\"")) { xmlField = new Field(); sCurrentLineInput = brInput.readLine(); fieldNo = str.split("\""); str = sCurrentLineInput; doc = Jsoup.parse(str, "", Parser.xmlParser()); toValidate = doc.select("value").text(); if (fieldNo.length < 1) { xmlField.setFieldNo(0); } else { xmlField.setFieldNo(Integer.parseInt(fieldNo[1])); } xmlField.setValue(toValidate); if (!toValidate.isEmpty()) { xmlFileHolder.add(xmlField); } } } brInput.close(); xmlBatchHolder.add(xmlFileHolder); } catch (FileNotFoundException ex) { Logger.getLogger(ValidationProcess.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(ValidationProcess.class.getName()).log(Level.SEVERE, null, ex); } } //Set field name based on the schema for (XMLHolder h : xmlBatchHolder) { for (Field f : h) { for (SchemaFields s : schemaFieldsList) { if (f.getFieldNo() == s.getFieldNo()) { f.setFieldName(s.getFieldName()); break; } } } } //Set field types for validation FS fs; for (XMLHolder h : xmlBatchHolder) { fs = new FS(); fs.setFileName(h.getFileName()); for (Field f : h) { if (f.getFieldName().toLowerCase().contains("nationality")) { f.setType("nationality"); f.setColumnHeader(f.getFieldName()); } else if 
(f.getFieldName().toLowerCase().contains("company name")) { f.setType("corporation"); } else if (f.getFieldName().toLowerCase().contains("position")) { f.setType("position"); } else if (f.getFieldName().toLowerCase().contains("directors/officers") && f.getFieldName().toLowerCase().contains("name")) { f.setType("name"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("directors/officers") && f.getFieldName().toLowerCase().contains("tin")) { f.setType("tin"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("directors/officers") && f.getFieldName().toLowerCase().contains("stockholder")) { f.setType("stockholder"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("directors/officers") && !f.getFieldName().toLowerCase().contains("how many")) { splitter = f.getFieldName().split(":", 2); if (splitter[1].trim().equals("Board") || splitter[1].trim().equals("Officer")) { f.setType("board"); } else { f.setType("none"); } f.setColumnHeader(f.getFieldName()); // } else if (f.getFieldName().equalsIgnoreCase("address")) { } else if (f.getFieldNo() == 31) { f.setType("city"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 32) { f.setType("province"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("phone number")) { f.setType("tel"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("fax number")) { f.setType("fax"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("contact person")) { f.setType("person"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("e-mail address")) { f.setType("email"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("website/url address")) { f.setType("website"); f.setColumnHeader(f.getFieldName()); } else if 
(f.getFieldName().toLowerCase().contains("tin/passport no.")) { f.setType("tin"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("% ownership")) { f.setType("ownership"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldName().toLowerCase().contains("share type")) { f.setType("shareType"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 1) { f.setType("sec"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 1207) { f.setType("tin"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 7) { if (!f.getValue().equals("*N/A")) { fs.setTotalAssets(Double.parseDouble(f.getValue().replaceAll(",", ""))); } f.setType("assets"); f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 9) { f.setType("liabilities"); if (!f.getValue().equals("*N/A")) { fs.setTotalLiabilities(Double.parseDouble(f.getValue().replaceAll(",", ""))); } f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 11) { f.setType("balanceSheet"); if (!f.getValue().equals("*N/A")) { fs.setTotalShareholderEquity(Double.parseDouble(f.getValue().replaceAll(",", ""))); } f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 3) { f.setType("grossc"); if (!f.getValue().equals("*N/A")) { fs.setGrossRevenue(Double.parseDouble(f.getValue().replaceAll(",", ""))); } f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 4) { f.setType("gross"); if (!f.getValue().equals("*N/A")) { fs.setGrossRevenueP(Double.parseDouble(f.getValue().replaceAll(",", ""))); } f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 5) { f.setType("netIncome"); if (!f.getValue().equals("*N/A")) { fs.setNetIncome(Double.parseDouble(f.getValue().replaceAll(",", ""))); } f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 6) { f.setType("netIncomeP"); if (!f.getValue().equals("*N/A")) { fs.setNetIncomeP(Double.parseDouble(f.getValue().replaceAll(",", ""))); } 
f.setColumnHeader(f.getFieldName()); } else if (f.getFieldNo() == 1201) { f.setType("purpose"); } else if (f.getFieldNo() == 1206) { f.setType("periodCovered"); } else if (f.getFieldNo() == 1209) { f.setType("fiscalYear"); } else { f.setType("none"); } } fsData.add(fs); } List<String> foundDupAlready = new ArrayList<>(); String tempFoundDup = ""; int dupCtr = 0; for (XMLHolder h : xmlBatchHolder) { for (Field f1 : h) { if (f1.getValue().equals("*N/A")) { //skip if value is *N/A continue; } for (Field f2 : h) { if (f2.getValue().equals("*N/A")) { //skip if value is *N/A continue; } if (f1.getValue().equals(f2.getValue()) && !f1.getFieldName().toLowerCase().contains("company") && !f2.getFieldName().toLowerCase().contains("company") && f1.getFieldName().toLowerCase().contains("name") && f2.getFieldName().toLowerCase().contains("name") //Check if board or stockholder only && ((f1.getFieldName().toLowerCase().contains("directors/officers") && f2.getFieldName().toLowerCase().contains("directors/officers")) || ((f1.getFieldName().toLowerCase().contains("name") && !f1.getFieldName().toLowerCase().contains("directors/officers")) && (f2.getFieldName().toLowerCase().contains("name") && !f2 .getFieldName().toLowerCase().contains("directors/officers")))) //end of condition to check && !f1.getValue().isEmpty() && !tempFoundDup.equals(f2.getValue())) { dupCtr++; if (dupCtr == 2 && foundDupAlready.indexOf(f1.getValue()) < 0) { if (f2.getFieldName().toLowerCase().contains("former") || f2.getFieldName().toLowerCase().contains("building")) { continue; } System.out.println("2 " + f2.getValue() + f2.getFieldName()); foundDupAlready.add(f2.getValue()); tempFoundDup = f2.getValue(); f2.add("Duplicate entry"); } } } dupCtr = 0; } } List<Field> dupeHolder = new ArrayList<>(); for (XMLHolder z : xmlBatchHolder) { for (Field f1 : z) { for (String s : f1) { if (s.equals("Duplicate entry")) { dupeHolder.add(f1); } } } } for (XMLHolder z : xmlBatchHolder) { for (Field f1 : z) { for (Field fd : 
dupeHolder) { if (fd.getFieldNo() != f1.getFieldNo() && fd.getValue().equals(f1.getValue())) { f1.add("Duplicate entry"); } } } } return validateData(xmlBatchHolder); }
From source file:com.sastix.cms.server.services.content.impl.ZipFileHandlerServiceImpl.java
/**
 * Returns the {@code href} of the first {@code <resource>} inside the
 * {@code <resources>} element of the metadata XML stored under
 * {@code METADATA_XML_FILE} (the last {@code <resources>} that has a
 * {@code <resource>} wins).
 *
 * @param bytesMap file contents keyed by name
 * @return {@code null} when the metadata entry is missing, an empty string
 *         when no {@code <resource>} is present, otherwise the {@code href} value
 */
@Override
public String findParentResource(Map<String, byte[]> bytesMap) {
    byte[] dataXml = bytesMap.get(METADATA_XML_FILE);
    if (dataXml == null) {
        return null;
    }

    String xml = "";
    try {
        xml = new String(dataXml, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        LOG.error("Error in finding parent resource name: {}", e.getLocalizedMessage());
    }

    Document doc = Jsoup.parse(xml, "", Parser.xmlParser());
    String href = "";
    for (Element resources : doc.select("resources")) {
        // first() yields null for an empty selection, whereas get(0) threw
        // IndexOutOfBoundsException on a <resources> element without children.
        Element firstResource = resources.select("resource").first();
        if (firstResource != null) {
            href = firstResource.attr("href");
        }
    }
    return href;
}