Example usage for org.jsoup.parser Parser xmlParser

List of usage examples for org.jsoup.parser Parser xmlParser

Introduction

On this page you can find usage examples for org.jsoup.parser.Parser.xmlParser().

Prototype

public static Parser xmlParser() 

Source Link

Document

Create a new XML parser.

Usage

From source file:Main.java

/**
 * Parses a fragment made of a {@code <style>} element followed by an {@code <xml>} element
 * with jsoup's XML parser, removes the {@code <style>} element and prints the remaining document.
 */
public static void main(String[] args) throws Exception {
    String styleSection = "<style>" //
            + "v\\:* {behavior:url(#default#VML);}" //
            + "o\\:* {behavior:url(#default#VML);}" //
            + "w\\:* {behavior:url(#default#VML);}" //
            + ".shape {behavior:url(#default#VML);}" //
            + "</style>";
    String xmlSection = "<xml>" //
            + "<w:WordDocument>" //
            + "<w:View>Normal</w:View>" //
            + "<w:Zoom>0</w:Zoom>" //
            + "<w:TrackMoves>false</w:TrackMoves>" //
            + "</xml>";

    // An empty base URI is passed because the fragment has no location of its own.
    Document parsed = Jsoup.parse(styleSection + xmlSection, "", Parser.xmlParser());
    parsed.select("style").remove();
    System.out.println(parsed);
}

From source file:DataCrawler.OpenAIRE.XMLGenerator.java

/**
 * Fills an XML template once per input record and writes each result to its own file.
 *
 * <p>Usage: {@code <command> template_file csv_file output_dir log_file [start_id]}
 * <ul>
 *   <li>{@code args[0]} - XML template, read as UTF-8.</li>
 *   <li>{@code args[1]} - input file, read as UTF-8, one '!'-separated record per line
 *       (a record that is split over several lines is re-joined).</li>
 *   <li>{@code args[2]} - output directory prefix. It is concatenated directly with the file
 *       name, so it has to end with a path separator.</li>
 *   <li>{@code args[3]} - log file; it is overwritten.</li>
 *   <li>{@code args[4]} - optional record id; no file is written until a record with this id
 *       has been read.</li>
 * </ul>
 *
 * <p>Any exception is printed to standard error and ends processing.
 */
public static void main(String[] args) {
    if (args.length < 4) {
        System.out.println("<command> template_file csv_file output_dir log_file [start_id]");
        // Nothing below can work without the four mandatory arguments.
        return;
    }

    // The highest field index read below is 26, so a complete record has at least 27 fields.
    final int requiredFieldCount = 27;
    final Charset utf8 = Charset.forName("UTF-8");
    final String outputDirectory = args[2];

    boolean start = true;
    String startID = "";
    if (args.length >= 5) {
        start = false;
        startID = args[4];
    }

    try {
        String content = new String(Files.readAllBytes(Paths.get(args[0])), utf8);
        // The second argument of Jsoup.parse is the base URI (not a charset); none is needed here.
        Document doc = Jsoup.parse(content, "", Parser.xmlParser());

        // The template elements never change identity, so they are looked up once.
        Element objectId = doc.getElementsByTag("dri:objIdentifier").first();
        Element title = doc.getElementsByTag("title").first();
        Element publisher = doc.getElementsByTag("publisher").first();
        Element dateofacceptance = doc.getElementsByTag("dateofacceptance").first();
        Element bestlicense = doc.getElementsByTag("bestlicense").first();
        Element resulttype = doc.getElementsByTag("resulttype").first();
        Elements originalIds = doc.getElementsByTag("originalId");
        Element originalId = originalIds.size() >= 1 ? originalIds.get(0) : null;
        Element originalId2 = originalIds.size() >= 2 ? originalIds.get(1) : null;
        Element response = doc.getElementsByTag("response").first();

        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(new FileInputStream(args[1]), utf8));
             PrintWriter logWriter = new PrintWriter(new FileOutputStream(args[3], false))) {
            String previousText = "";
            String text;
            while ((text = br.readLine()) != null) {
                /*  Fields used for publications:
                     0. dri:objIdentifier content
                     9. title content
                    12. publisher content
                    18. dateofacceptance
                    19. bestlicense @classname
                    21. resulttype  @classname
                    26. originalId content
                    (Each originalId value carries a "null" prefix; two values are separated by a space.)
                */
                if (!previousText.isEmpty()) {
                    text = previousText + text;
                    // NOTE(review): kept from the original - re-joining a split record also enables
                    // output, even if start_id has not been seen yet. Confirm this is intended.
                    start = true;
                    previousText = "";
                }

                String[] items = text.split("!");
                for (int i = 0; i < items.length; ++i) {
                    items[i] = StringUtils.strip(items[i], "#");
                }

                // Check completeness before touching any field; an incomplete line is treated as
                // the first part of a record that continues on the next line.
                if (items.length < requiredFieldCount) {
                    previousText = text;
                    continue;
                }

                if (!start && items[0].equals(startID)) {
                    start = true;
                }

                objectId.text(items[0]);
                title.text(items[9]);
                publisher.text(items[12]);
                dateofacceptance.text(items[18]);
                bestlicense.attr("classname", items[19]);
                resulttype.attr("classname", items[21]);

                String[] context = items[26].split(" ");
                if (originalId != null && context.length >= 1) {
                    originalId.text(stripNullPrefix(context[0]));
                }
                if (originalId2 != null && context.length >= 2) {
                    originalId2.text(stripNullPrefix(context[1]));
                }

                if (start) {
                    String filePath = outputDirectory + items[0].replace(":", "#") + ".xml";
                    // Write in UTF-8 so the bytes match the encoding declared in the XML header.
                    try (PrintWriter writer = new PrintWriter(
                            Files.newBufferedWriter(Paths.get(filePath), utf8))) {
                        logWriter.write(filePath + " > Start" + System.lineSeparator());
                        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + System.lineSeparator());
                        writer.write(response.toString());
                    }
                    logWriter.write(filePath + " > OK" + System.lineSeparator());
                    logWriter.flush();
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Extracts the value from one originalId token.
 *
 * <p>If the trimmed token contains {@code "null"}, the result is whatever follows that marker
 * plus one further character (empty if nothing follows); otherwise the trimmed token itself.
 */
private static String stripNullPrefix(String token) {
    String trimmed = token.trim();
    int marker = trimmed.indexOf("null");
    if (marker == -1) {
        return trimmed;
    }
    // 5 = the four characters of "null" plus the single character that follows it.
    int valueStart = marker + 5;
    return trimmed.length() >= valueStart ? trimmed.substring(valueStart) : "";
}

From source file:me.vertretungsplan.parser.IndiwareDemoTest.java

/** Parses the XML fixture with jsoup's XML parser and checks the resulting day via {@code verify}. */
@Test
public void demoTestXML() throws IOException {
    SubstitutionScheduleDay parsedDay =
            parser.parseIndiwareDay(Jsoup.parse(xml, "", Parser.xmlParser()), false);
    verify(parsedDay);
}

From source file:com.sastix.cms.server.services.content.impl.GeneralFileHandlerServiceImpl.java

/**
 * Finds the {@code href} of the first {@code <resource>} below a {@code <resources>} element.
 *
 * <p>If the document contains several {@code <resources>} elements, the last one that has a
 * {@code <resource>} descendant wins.
 *
 * @param xml the XML text to inspect
 * @return the {@code href} attribute value, or {@code null} if no {@code <resources>} element
 *         with a {@code <resource>} descendant exists
 */
@Override
public String findParentFile(String xml) {
    String parentHref = null;
    Document doc = Jsoup.parse(xml, "", Parser.xmlParser());
    for (Element resources : doc.select("resources")) {
        // first() yields null for an empty selection, whereas get(0) would throw.
        Element firstResource = resources.select("resource").first();
        if (firstResource != null) {
            parentHref = firstResource.attr("href");
        }
    }
    return parentHref;
}

From source file:com.johan.vertretungsplan.additionalinfo.WinterShParser.java

/**
 * Downloads the feed at {@code URL} and turns the first item's description into an
 * {@link AdditionalInfo}.
 *
 * <p>When the description contains the "no weather-related cancellations" sentence, the text is
 * replaced by "keine Informationen" and the info is flagged as carrying no information. The
 * title is {@code TITLE} followed by the text of the first {@code pubDate} element.
 */
@Override
public AdditionalInfo getAdditionalInfo() throws IOException, JSONException {
    AdditionalInfo result = new AdditionalInfo();
    Document feed = Jsoup.parse(httpGet(URL, ENCODING), "", Parser.xmlParser());

    String description = feed.select("item description").first().text();
    boolean nothingToReport = description
            .contains("Aktuell gibt es keine Hinweise auf witterungsbedingten Unterrichtsausfall.");
    if (nothingToReport) {
        description = "keine Informationen";
        result.setHasInformation(false);
    }
    result.setText(description);

    String published = feed.select("pubDate").first().text();
    result.setTitle(TITLE + " (Stand: " + published + ")");
    return result;
}

From source file:me.vertretungsplan.parser.IndiwareDemoTest.java

/** Checks that parsing the XML fixture and the HTML fixture yields equal schedule days. */
@Test
public void testEquals() throws IOException {
    SubstitutionScheduleDay fromXml =
            parser.parseIndiwareDay(Jsoup.parse(xml, "", Parser.xmlParser()), false);
    SubstitutionScheduleDay fromHtml = parser.parseIndiwareDay(Jsoup.parse(html), true);
    assertEquals(fromXml, fromHtml);
}

From source file:me.vertretungsplan.parser.IndiwareStundenplan24Parser.java

/**
 * Downloads the daily {@code VplanKl<date>.xml} files for the next {@code MAX_DAYS} days and
 * assembles them into one {@link SubstitutionSchedule}.
 *
 * <p>With a {@code schoolNumber} in the configuration the files are fetched from
 * stundenplan24.de using the user/password credential; otherwise the configured
 * {@code baseurl} is used and login is delegated to {@link LoginHandler}. A day is kept only if
 * the file name stated inside the document matches the requested one. HTTP responses with
 * status 404 or 300 are skipped; any other {@link HttpResponseException} is rethrown.
 *
 * @throws IOException if no user/password credential is available for the school-number
 *         variant (message "no login") or a request fails
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {

    final String baseurl;
    if (!data.has("schoolNumber")) {
        baseurl = data.getString("baseurl") + "/";
        new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);
    } else {
        baseurl = "http://www.stundenplan24.de/" + data.getString("schoolNumber") + "/vplan/";
        // instanceof is false for null, so this also covers a missing credential.
        if (!(credential instanceof UserPasswordCredential)) {
            throw new IOException("no login");
        }
        UserPasswordCredential userPass = (UserPasswordCredential) credential;
        String login = userPass.getUsername();
        String password = userPass.getPassword();
        executor.auth(login, password);
    }

    List<Document> fetchedDays = new ArrayList<>();
    for (int dayOffset = 0; dayOffset < MAX_DAYS; dayOffset++) {
        String dateStr = DateTimeFormat.forPattern("yyyyMMdd").print(LocalDate.now().plusDays(dayOffset));
        String expectedFileName = "VplanKl" + dateStr + ".xml";
        // The timestamp query parameter makes every request URL unique.
        String url = baseurl + "vdaten/" + expectedFileName + "?_=" + System.currentTimeMillis();
        try {
            Document day = Jsoup.parse(httpGet(url, ENCODING), url, Parser.xmlParser());
            if (expectedFileName.equals(day.select("kopf datei").text())) {
                fetchedDays.add(day);
            }
        } catch (HttpResponseException e) {
            int status = e.getStatusCode();
            boolean ignorable = status == 404 || status == 300;
            if (!ignorable) {
                throw e;
            }
        }
    }

    SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);
    for (Document day : fetchedDays) {
        schedule.addDay(parseIndiwareDay(day, false));
    }
    schedule.setWebsite(baseurl);
    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}

From source file:com.sastix.cms.server.services.content.impl.ZipFileHandlerServiceImpl.java

/**
 * Reports whether the metadata file of an unpacked archive declares the SCORM type "sco".
 *
 * <p>The {@code adlcp:scormType} attribute is read from the first {@code <resource>} below each
 * {@code <resources>} element; if there are several such elements the last one with a
 * {@code <resource>} descendant decides.
 *
 * @param bytesMap archive entries; the entry stored under {@code METADATA_XML_FILE} is inspected
 * @return {@code true} only if that entry exists and the attribute value is exactly "sco"
 */
@Override
public boolean isScormType(Map<String, byte[]> bytesMap) {
    byte[] dataXml = bytesMap.get(METADATA_XML_FILE);
    if (dataXml == null) {
        return false;
    }

    String xml = "";
    try {
        xml = new String(dataXml, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        // Best effort: continue with an empty document, which yields false below.
        LOG.error("Error in determining if it is a scorm type: {}", e.getLocalizedMessage());
    }

    String scormType = "";
    Document doc = Jsoup.parse(xml, "", Parser.xmlParser());
    for (Element resources : doc.select("resources")) {
        // first() yields null for an empty selection, whereas get(0) would throw.
        Element firstResource = resources.select("resource").first();
        if (firstResource != null) {
            scormType = firstResource.attr("adlcp:scormType");
        }
    }
    return "sco".equals(scormType);
}

From source file:com.svi.uzabase.logic.ValidationProcess.java

/**
 * Reads every file listed in {@code xmlHolder}, collects its non-empty fields, classifies each
 * field for validation, marks duplicated name values and finally passes everything to
 * {@code validateData}.
 *
 * <p>Steps:
 * <ol>
 *   <li>Per file: each line containing {@code field no="} supplies the field number (text
 *       between the first pair of double quotes); the following line is parsed as XML and the
 *       text of its {@code value} element becomes the field value. Fields with an empty value
 *       are dropped. A file that cannot be read is logged and left out of the result.</li>
 *   <li>Field names are copied from the first entry of {@code schemaFieldsList} with the same
 *       field number.</li>
 *   <li>A validation type (and usually a column header) is set on each field from its name or
 *       number; selected numeric fields are also stored in an {@code FS} record that is
 *       appended to {@code fsData}.</li>
 *   <li>Fields whose value repeats among name fields of the same group are tagged with
 *       "Duplicate entry", and the tag is then spread to all fields sharing such a value.</li>
 * </ol>
 *
 * @return the result of {@code validateData} for the collected holders
 */
private List<XMLHolder> extractXML() {
    System.out.println("in extracting xml");
    List<XMLHolder> xmlBatchHolder = new ArrayList<>();
    progress = new AtomicInteger(0);
    total = new AtomicInteger(xmlHolder.size());
    mf.setJprogressValues(total, progress);
    for (String xmlPath : xmlHolder) {
        mf.loader("Extracting XML: ", false);
        XMLHolder xmlFileHolder = new XMLHolder();
        xmlFileHolder.setFileName(xmlPath);
        // try-with-resources closes the reader even if reading fails part-way through.
        try (BufferedReader brInput = new BufferedReader(new FileReader(xmlPath))) {
            String headerLine;
            while ((headerLine = brInput.readLine()) != null) {
                if (!headerLine.contains("field no=\"")) {
                    continue;
                }
                String valueLine = brInput.readLine();
                if (valueLine == null) {
                    // Field header on the last line of the file: there is no value to parse.
                    break;
                }
                String[] fieldNo = headerLine.split("\"");
                org.jsoup.nodes.Document doc = Jsoup.parse(valueLine, "", Parser.xmlParser());
                String toValidate = doc.select("value").text();
                Field xmlField = new Field();
                // The number is the second piece of the split, so at least two pieces are needed.
                if (fieldNo.length < 2) {
                    xmlField.setFieldNo(0);
                } else {
                    xmlField.setFieldNo(Integer.parseInt(fieldNo[1]));
                }
                xmlField.setValue(toValidate);
                if (!toValidate.isEmpty()) {
                    xmlFileHolder.add(xmlField);
                }
            }
            xmlBatchHolder.add(xmlFileHolder);
        } catch (IOException ex) {
            // Covers FileNotFoundException as well; the file is skipped.
            Logger.getLogger(ValidationProcess.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    //Set field name based on the schema
    for (XMLHolder h : xmlBatchHolder) {
        for (Field f : h) {
            for (SchemaFields s : schemaFieldsList) {
                if (f.getFieldNo() == s.getFieldNo()) {
                    f.setFieldName(s.getFieldName());
                    break;
                }
            }
        }
    }
    //Set field types for validation
    for (XMLHolder h : xmlBatchHolder) {
        FS fs = new FS();
        fs.setFileName(h.getFileName());
        for (Field f : h) {
            // NOTE(review): assumes every field received a name in the schema pass above - confirm.
            String name = f.getFieldName().toLowerCase();
            boolean officerField = name.contains("directors/officers");
            if (name.contains("nationality")) {
                f.setType("nationality");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("company name")) {
                f.setType("corporation");
            } else if (name.contains("position")) {
                f.setType("position");
            } else if (officerField && name.contains("name")) {
                f.setType("name");
                f.setColumnHeader(f.getFieldName());
            } else if (officerField && name.contains("tin")) {
                f.setType("tin");
                f.setColumnHeader(f.getFieldName());
            } else if (officerField && name.contains("stockholder")) {
                f.setType("stockholder");
                f.setColumnHeader(f.getFieldName());
            } else if (officerField && !name.contains("how many")) {
                String[] splitter = f.getFieldName().split(":", 2);
                // A name without ':' has no second part; treat it like any other non-board field.
                if (splitter.length > 1
                        && (splitter[1].trim().equals("Board") || splitter[1].trim().equals("Officer"))) {
                    f.setType("board");
                } else {
                    f.setType("none");
                }
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 31) {
                f.setType("city");
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 32) {
                f.setType("province");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("phone number")) {
                f.setType("tel");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("fax number")) {
                f.setType("fax");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("contact person")) {
                f.setType("person");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("e-mail address")) {
                f.setType("email");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("website/url address")) {
                f.setType("website");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("tin/passport no.")) {
                f.setType("tin");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("% ownership")) {
                f.setType("ownership");
                f.setColumnHeader(f.getFieldName());
            } else if (name.contains("share type")) {
                f.setType("shareType");
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 1) {
                f.setType("sec");
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 1207) {
                f.setType("tin");
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 7) {
                if (!f.getValue().equals("*N/A")) {
                    fs.setTotalAssets(Double.parseDouble(f.getValue().replace(",", "")));
                }
                f.setType("assets");
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 9) {
                f.setType("liabilities");
                if (!f.getValue().equals("*N/A")) {
                    fs.setTotalLiabilities(Double.parseDouble(f.getValue().replace(",", "")));
                }
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 11) {
                f.setType("balanceSheet");
                if (!f.getValue().equals("*N/A")) {
                    fs.setTotalShareholderEquity(Double.parseDouble(f.getValue().replace(",", "")));
                }
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 3) {
                f.setType("grossc");
                if (!f.getValue().equals("*N/A")) {
                    fs.setGrossRevenue(Double.parseDouble(f.getValue().replace(",", "")));
                }
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 4) {
                f.setType("gross");
                if (!f.getValue().equals("*N/A")) {
                    fs.setGrossRevenueP(Double.parseDouble(f.getValue().replace(",", "")));
                }
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 5) {
                f.setType("netIncome");
                if (!f.getValue().equals("*N/A")) {
                    fs.setNetIncome(Double.parseDouble(f.getValue().replace(",", "")));
                }
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 6) {
                f.setType("netIncomeP");
                if (!f.getValue().equals("*N/A")) {
                    fs.setNetIncomeP(Double.parseDouble(f.getValue().replace(",", "")));
                }
                f.setColumnHeader(f.getFieldName());
            } else if (f.getFieldNo() == 1201) {
                f.setType("purpose");
            } else if (f.getFieldNo() == 1206) {
                f.setType("periodCovered");
            } else if (f.getFieldNo() == 1209) {
                f.setType("fiscalYear");
            } else {
                f.setType("none");
            }
        }
        fsData.add(fs);
    }

    // Tag the second occurrence of a value that repeats among comparable name fields.
    List<String> foundDupAlready = new ArrayList<>();
    String tempFoundDup = "";
    for (XMLHolder h : xmlBatchHolder) {
        for (Field f1 : h) {
            if (f1.getValue().equals("*N/A")) {
                //skip if value is *N/A
                continue;
            }
            String name1 = f1.getFieldName().toLowerCase();
            boolean officer1 = name1.contains("directors/officers");
            // Counts matches for this f1; f1 matches itself, so 2 means one real repetition.
            int dupCtr = 0;
            for (Field f2 : h) {
                if (f2.getValue().equals("*N/A")) {
                    //skip if value is *N/A
                    continue;
                }
                String name2 = f2.getFieldName().toLowerCase();
                boolean officer2 = name2.contains("directors/officers");
                boolean sameValue = f1.getValue().equals(f2.getValue());
                boolean neitherCompany = !name1.contains("company") && !name2.contains("company");
                boolean bothNames = name1.contains("name") && name2.contains("name");
                //Check if board or stockholder only: both officer fields, or both non-officer fields
                boolean sameGroup = (officer1 && officer2) || (!officer1 && !officer2);
                if (sameValue && neitherCompany && bothNames && sameGroup
                        && !f1.getValue().isEmpty() && !tempFoundDup.equals(f2.getValue())) {
                    dupCtr++;
                    if (dupCtr == 2 && !foundDupAlready.contains(f1.getValue())) {
                        if (name2.contains("former") || name2.contains("building")) {
                            continue;
                        }
                        System.out.println("2 " + f2.getValue() + f2.getFieldName());
                        foundDupAlready.add(f2.getValue());
                        tempFoundDup = f2.getValue();
                        f2.add("Duplicate entry");
                    }
                }
            }
        }
    }
    // Collect every tagged field (once per tag it carries) ...
    List<Field> dupeHolder = new ArrayList<>();
    for (XMLHolder z : xmlBatchHolder) {
        for (Field f1 : z) {
            for (String s : f1) {
                if (s.equals("Duplicate entry")) {
                    dupeHolder.add(f1);
                }
            }
        }
    }
    // ... and tag all other fields that share a value with one of them.
    for (XMLHolder z : xmlBatchHolder) {
        for (Field f1 : z) {
            for (Field fd : dupeHolder) {
                if (fd.getFieldNo() != f1.getFieldNo() && fd.getValue().equals(f1.getValue())) {
                    f1.add("Duplicate entry");
                }
            }
        }
    }

    return validateData(xmlBatchHolder);
}

From source file:com.sastix.cms.server.services.content.impl.ZipFileHandlerServiceImpl.java

/**
 * Finds the {@code href} of the first {@code <resource>} below a {@code <resources>} element
 * in the metadata file of an unpacked archive.
 *
 * <p>If there are several {@code <resources>} elements, the last one that has a
 * {@code <resource>} descendant wins.
 *
 * @param bytesMap archive entries; the entry stored under {@code METADATA_XML_FILE} is inspected
 * @return {@code null} if that entry is missing; otherwise the {@code href} value, or an empty
 *         string if no {@code <resources>} element with a {@code <resource>} descendant exists
 */
@Override
public String findParentResource(Map<String, byte[]> bytesMap) {
    byte[] dataXml = bytesMap.get(METADATA_XML_FILE);
    if (dataXml == null) {
        return null;
    }

    String xml = "";
    try {
        xml = new String(dataXml, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        // Best effort: continue with an empty document, which yields "" below.
        LOG.error("Error in finding parent resource name: {}", e.getLocalizedMessage());
    }

    String parentHref = "";
    Document doc = Jsoup.parse(xml, "", Parser.xmlParser());
    for (Element resources : doc.select("resources")) {
        // first() yields null for an empty selection, whereas get(0) would throw.
        Element firstResource = resources.select("resource").first();
        if (firstResource != null) {
            parentHref = firstResource.attr("href");
        }
    }
    return parentHref;
}