List of usage examples for org.dom4j.io SAXReader setValidation
public void setValidation(boolean validation)
From source file:com.globalsight.everest.tm.util.TmxLevelSplitter.java
License:Apache License
public void split(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); m_entryCount = 0;/*from www . j a v a 2 s. co m*/ SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Splitting document `" + p_url + "'"); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = element; try { startFiles(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (containsTags(element)) { writeTagsEntry(element.asXML()); m_tagsCount++; } else { writeTextEntry(element.asXML()); m_textCount++; } // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); closeFiles(); log("Processed " + m_entryCount + " TUs, " + m_textCount + " level 1 (text), " + m_tagsCount + " level 2 (tags)"); // all done }
From source file:com.globalsight.everest.tm.util.TmxSplitter.java
License:Apache License
public void split(String p_url, String p_numEntries) throws Exception { final int maxEntries = Integer.parseInt(p_numEntries); final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); m_entryCount = 0;//from www.jav a 2 s .c om SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Splitting document `" + p_url + "'"); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = element; try { startFile(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; if (m_entryCount % maxEntries == 0) { try { closeFile(); startFile(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } } } public void onEnd(ElementPath path) { Element element = path.getCurrent(); writeEntry(element.asXML()); // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); closeFile(); // all done }
From source file:com.globalsight.everest.tm.util.trados.TradosFmSgmlTmxToGxml.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *///from w w w . j a v a2s .co m public String convertToGxml(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to GXML: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { final public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } final public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { final public void onStart(ElementPath path) { } final public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); createNewHeader(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { final public void onStart(ElementPath path) { ++m_entryCount; m_tuError = false; } final public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (m_tuError) { m_errorCount++; } else { writeEntry(element.asXML()); } // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() { final public void onStart(ElementPath path) { } final public void onEnd(ElementPath path) { Element element = 
path.getCurrent(); try { String gxml = handleTuv(element); Document doc = parse("<root>" + gxml + "</root>"); // Remove old content of seg List content = element.content(); for (int i = content.size() - 1; i >= 0; --i) { ((Node) content.get(i)).detach(); } // Add new GXML content (backwards) content = doc.getRootElement().content(); Collections.reverse(content); for (int i = content.size() - 1; i >= 0; --i) { Node node = (Node) content.get(i); element.add(node.detach()); } } catch (Throwable ex) { m_tuError = true; } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs " + "into file `" + m_filename + "', " + m_errorCount + " errors."); return m_filename; }
From source file:com.globalsight.everest.tm.util.trados.TradosFmTmxToGxml.java
License:Apache License
/**
 * Converts the TMX file at {@code p_url} to GXML and returns the name of
 * the file that was written.
 *
 * <p>The document is read with a validating Xerces SAX parser. Each
 * {@code seg} is rewritten in place with the output of {@code handleTuv};
 * a TU whose segment conversion fails is counted as an error and not
 * written. Finished {@code header} and {@code tu} elements are detached
 * from the tree to keep memory usage low.
 *
 * @param p_url location of the TMX file to convert
 * @return the value of {@code m_filename} after conversion
 * @throws Exception propagated from output-file handling, parser setup, or
 *         reading the document
 */
public String convertToGxml(String p_url) throws Exception {
    final String baseName = getBaseName(p_url);
    final String extension = getExtension(p_url);

    info("Converting TMX file to GXML: `" + p_url + "'");

    startOutputFile(baseName);

    m_entryCount = 0;

    // Reading from a file, need to use Xerces.
    SAXReader reader = new SAXReader();
    reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");
    reader.setEntityResolver(DtdResolver.getInstance());
    reader.setValidation(true);

    // Root element: only the TMX version attribute is of interest.
    reader.addHandler("/tmx", new ElementHandler() {
        public void onStart(ElementPath path) {
            Element element = path.getCurrent();
            m_version = element.attributeValue("version");
        }

        public void onEnd(ElementPath path) {
            // nothing to do
        }
    });

    // Header: keep the old header and build the new one.
    reader.addHandler("/tmx/header", new ElementHandler() {
        public void onStart(ElementPath path) {
            // nothing to do
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            setOldHeader(element);
            createNewHeader();

            // prune the current element to reduce memory
            element.detach();
        }
    });

    // Translation units: reset the error flag on start, write or count
    // the TU on end.
    reader.addHandler("/tmx/body/tu", new ElementHandler() {
        public void onStart(ElementPath path) {
            ++m_entryCount;
            m_tuError = false;
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            if (m_tuError) {
                m_errorCount++;
            } else {
                writeEntry(element.asXML());
            }

            // prune the current element to reduce memory
            element.detach();

            if (m_entryCount % 1000 == 0) {
                debug("Entry " + m_entryCount);
            }
        }
    });

    // Segments: replace the original content with the converted GXML.
    reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() {
        public void onStart(ElementPath path) {
            // nothing to do
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            try {
                String gxml = handleTuv(element);
                Document doc = parse("<root>" + gxml + "</root>");

                // Remove old content of seg
                List content = element.content();
                for (int i = content.size() - 1; i >= 0; --i) {
                    ((Node) content.get(i)).detach();
                }

                // Add new GXML content: the list is reversed and then
                // consumed from the tail, so nodes are appended in their
                // original order (assumes detach() removes the node from
                // this live list, keeping the lower indices valid).
                content = doc.getRootElement().content();
                Collections.reverse(content);
                for (int i = content.size() - 1; i >= 0; --i) {
                    Node node = (Node) content.get(i);
                    element.add(node.detach());
                }
            } catch (Throwable ex) {
                // Best effort: a failing segment only invalidates its TU.
                // Record why, instead of discarding the cause silently.
                m_tuError = true;
                debug("Entry " + m_entryCount + " could not be converted: " + ex);
            }
        }
    });

    // All work happens in the handlers; the returned document is not needed.
    reader.read(p_url);

    closeOutputFile();

    info("Processed " + m_entryCount + " TUs into file `" + m_filename + "', " + m_errorCount
            + " errors.");

    return m_filename;
}
From source file:com.globalsight.everest.tm.util.trados.TradosHtmlTmxToGxml.java
License:Apache License
/**
 * Converts the TMX file at {@code p_url} to GXML and returns the name of
 * the file that was written.
 *
 * <p>The document is read with a validating Xerces SAX parser. Each
 * {@code seg} is first passed through {@code removeUtElements}; its text
 * is then converted by {@code handleTuv} and the result replaces the
 * segment content. A TU whose segment conversion fails is counted as an
 * error and not written. Finished {@code header} and {@code tu} elements
 * are detached from the tree to keep memory usage low.
 *
 * @param p_url location of the TMX file to convert
 * @return the value of {@code m_filename} after conversion
 * @throws Exception propagated from output-file handling, parser setup, or
 *         reading the document
 */
public String convertToGxml(String p_url) throws Exception {
    final String baseName = getBaseName(p_url);
    final String extension = getExtension(p_url);

    info("Converting TMX file to GXML: `" + p_url + "'");

    startOutputFile(baseName);

    m_entryCount = 0;

    // Reading from a file, need to use Xerces.
    SAXReader reader = new SAXReader();
    reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");
    reader.setEntityResolver(DtdResolver.getInstance());
    reader.setValidation(true);

    // Root element: only the TMX version attribute is of interest.
    reader.addHandler("/tmx", new ElementHandler() {
        public void onStart(ElementPath path) {
            Element element = path.getCurrent();
            m_version = element.attributeValue("version");
        }

        public void onEnd(ElementPath path) {
            // nothing to do
        }
    });

    // Header: keep the old header and build the new one.
    reader.addHandler("/tmx/header", new ElementHandler() {
        public void onStart(ElementPath path) {
            // nothing to do
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            setOldHeader(element);
            createNewHeader();

            // prune the current element to reduce memory
            element.detach();
        }
    });

    // Translation units: reset the error flag on start, write or count
    // the TU on end.
    reader.addHandler("/tmx/body/tu", new ElementHandler() {
        public void onStart(ElementPath path) {
            ++m_entryCount;
            m_tuError = false;
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            if (m_tuError) {
                m_errorCount++;
            } else {
                writeEntry(element.asXML());
            }

            // prune the current element to reduce memory
            element.detach();

            if (m_entryCount % 1000 == 0) {
                debug("Entry " + m_entryCount);
            }
        }
    });

    // Segments: strip <ut> elements, then replace the content with the
    // GXML produced from the remaining text.
    reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() {
        public void onStart(ElementPath path) {
            // nothing to do
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            try {
                element = removeUtElements(element);

                String gxml = handleTuv(element.getText());
                Document doc = parse("<root>" + gxml + "</root>");

                // Remove old content of seg
                List content = element.content();
                for (int i = content.size() - 1; i >= 0; --i) {
                    ((Node) content.get(i)).detach();
                }

                // Add new GXML content: the list is reversed and then
                // consumed from the tail, so nodes are appended in their
                // original order (assumes detach() removes the node from
                // this live list, keeping the lower indices valid).
                content = doc.getRootElement().content();
                Collections.reverse(content);
                for (int i = content.size() - 1; i >= 0; --i) {
                    Node node = (Node) content.get(i);
                    element.add(node.detach());
                }
            } catch (Throwable ex) {
                // Best effort: a failing segment only invalidates its TU.
                // Record why, instead of discarding the cause silently.
                m_tuError = true;
                debug("Entry " + m_entryCount + " could not be converted: " + ex);
            }
        }
    });

    // All work happens in the handlers; the returned document is not needed.
    reader.read(p_url);

    closeOutputFile();

    info("Processed " + m_entryCount + " TUs into file `" + m_filename + "', " + m_errorCount
            + " errors.");

    return m_filename;
}
From source file:com.globalsight.everest.tm.util.trados.TradosTmxToRtf.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *//*from ww w.j a v a2 s. com*/ public String convertToRtf(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to RTF: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); Element prop = (Element) element.selectSingleNode("/prop[@type='RTFFontTable']"); if (prop != null) writeEntry(prop.getText()); prop = (Element) element.selectSingleNode("/prop[@type='RTFStyleSheet']"); if (prop != null) writeEntry(prop.getText()); writeOtherRtfHeader(); writeDummyParagraph(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element = removeUtElements(element); writeEntry(replaceUnicodeChars(removeRtfParagraphs(element.asXML()))); writeEntry("\\par"); // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " 
+ m_entryCount); } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs into file `" + m_filename + "'"); return m_filename; }
From source file:com.globalsight.everest.webapp.pagehandler.administration.config.xmldtd.XmlDtdManager.java
License:Apache License
/**
 * Validates an XML file against the DTD registered for the given id.
 *
 * <p>Nothing is validated when the file name does not end in
 * {@code .xml}, or when no existing DTD file is found for it.
 *
 * @param id
 *            The XML DTD id.
 * @param file
 *            The XML file to validate.
 * @throws DtdException
 *             if reading the file with validation enabled fails
 */
public static void validateXmlFile(long id, File file) throws DtdException {
    Assert.assertFileExist(file);

    if (!file.getName().endsWith(".xml")) {
        return;
    }
    logger.debug("File: " + file.getPath());

    File dtd = DtdFileManager.getDtdFile(id, file);
    if (dtd == null || !dtd.exists()) {
        return;
    }
    logger.debug("DTD: " + dtd.getPath());

    SAXReader validatingReader = new SAXReader();
    validatingReader.setEntityResolver(new DtdEntityResolver(dtd));
    validatingReader.setValidation(true);

    try {
        // Parsing is done only for its validation side effect; the tree
        // itself is discarded right away.
        Document parsed = validatingReader.read(file);
        parsed.clearContent();
        logger.debug("Successful");
    } catch (Exception e) {
        logger.info("DTD validation failed: " + e.getMessage());
        throw new DtdException(e);
    }
}
From source file:com.globalsight.ling.docproc.DiplomatWordCounter.java
License:Apache License
private SAXReader createXmlParser() { SAXReader result = new SAXReader(); try {/*www.j av a 2s . c om*/ result.setXMLReaderClassName("org.dom4j.io.aelfred.SAXDriver"); result.setValidation(false); } catch (Exception ex) { System.err.println("org.dom4j.io.aelfred.SAXDriver not found"); // Use the system default parser, better than nothing. result = new SAXReader(); result.setValidation(false); } return result; }
From source file:com.globalsight.smartbox.bussiness.process.Usecase01PreProcess.java
License:Apache License
private void parseFile(Vector<String> p_sourceFiles) { // Parse Bookmark XML File to get fileProfileName and job name. Map<String, String> srcMap = new HashMap<String, String>(); String fileName = null;/* w ww . java 2 s.co m*/ for (String path : p_sourceFiles) { String temp = path.substring(path.lastIndexOf(File.separator) + 1); srcMap.put(temp, path); if (temp.endsWith(".pdf")) { fileName = temp.replace(".pdf", ".xml"); } } String path = srcMap.get(fileName); try { SAXReader saxReader = new SAXReader(); saxReader.setValidation(false); saxReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); Document doc = saxReader.read(path); Element node = (Element) doc.selectSingleNode("//bookmeta/data[@datatype='GSDATA']"); String gsDataValue = node.attributeValue("value"); String jobName = gsDataValue.substring(0, gsDataValue.indexOf("~")); String customerFPName = gsDataValue.substring(gsDataValue.indexOf("~") + 1); FileProfile configFP = getConfigFP(customerFPName); if (configFP == null) { String message = "Can't find the fileProfileNameMapping for " + customerFPName; LogUtil.info(message); return; } fp = gsFPMap.get(configFP.getName()); if (fp == null) { String message = "Can't find the file profile."; LogUtil.info(message); return; } // Get Target Locale. 
node = (Element) doc.selectSingleNode("/bookmap"); String lang = node.attributeValue("lang"); for (String locale : fp.getTargetLocale()) { if (locale.startsWith(lang)) { trgLocale = locale; break; } } if (trgLocale == null || trgLocale.trim().length() == 0) { String message = "Can't find the correct Target Locale in File Profile."; LogUtil.info(message); return; } unExtractedFP = gsFPMap.get(configFP.getGsUnExtractedFPName()); if (unExtractedFP == null) { String message = "Can't find the UnExtracted file profile: " + fp.getGsUnExtractedFPName(); LogUtil.info(message); return; } basicJobName = jobName + "_" + customerFPName + "_" + lang; } catch (Exception e) { String message = "Read XML error: " + path; LogUtil.fail(message, e); return; } }
From source file:com.globalsight.smartbox.bussiness.process.Usecase04PreProcess.java
License:Apache License
private void parseFile(Vector<String> p_sourceFiles) { String path = null;/*from ww w . ja v a2s. c o m*/ try { // Parse Bookmark XML File to get fileProfileName and job name. path = getBookMapFile(p_sourceFiles); SAXReader saxReader = new SAXReader(); saxReader.setValidation(false); saxReader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); Document doc = saxReader.read(path); Element node = (Element) doc.selectSingleNode("//bookmeta/data[@datatype='GSDATA']"); String gsDataValue = node.attributeValue("value"); String jobName = gsDataValue.substring(0, gsDataValue.indexOf("~")); String customerFPName = gsDataValue.substring(gsDataValue.indexOf("~") + 1); FileProfile configFP = getConfigFP(customerFPName); if (configFP == null) { String message = "Can't find the fileProfileNameMapping for " + customerFPName; LogUtil.info(message); return; } fp = gsFPMap.get(configFP.getName()); if (fp == null) { String message = "Can't find the file profile."; LogUtil.info(message); return; } // Get Target Locale. node = (Element) doc.selectSingleNode("/bookmap"); String lang = node.attributeValue("lang"); for (String locale : fp.getTargetLocale()) { if (locale.startsWith(lang)) { trgLocale = locale; break; } } if (trgLocale == null || trgLocale.trim().length() == 0) { String message = "Can't find the correct Target Locale in File Profile."; LogUtil.info(message); return; } unExtractedFP = gsFPMap.get(configFP.getGsUnExtractedFPName()); if (unExtractedFP == null) { String message = "Can't find the UnExtracted file profile: " + fp.getGsUnExtractedFPName(); LogUtil.info(message); return; } basicJobName = jobName + "_" + customerFPName + "_" + lang; } catch (FileNotFoundException e) { LogUtil.fail("", e); return; } catch (Exception e) { String message = "Read XML error: " + path; LogUtil.fail(message, e); return; } }