List of usage examples for org.dom4j.io SAXReader setXMLReaderClassName
public void setXMLReaderClassName(String xmlReaderClassName) throws SAXException
XMLReader
to be used to parse SAX events. From source file:com.globalsight.everest.tm.importer.TmxReader.java
License:Apache License
/** * Reads an XML file and checks its correctness by validating * against the TMX DTD. If there's any error in the file, an * exception is thrown.//ww w . jav a 2 s . c om * * As a side effect, this method builds a list of source and * target locales found in the file, including the declared source * locale from the header. */ private void analyzeXml(String p_url) throws Exception { if (m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_RTF || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_HTML || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_FM || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_FM_SGML || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_IL || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_XPTAG) { // Convert the Trados codes to native System4 codes by // converting the file to RTF, saving it as HTML and // extracting the resulting TUVs. CATEGORY.info("Converting Trados TMX to native TMX: " + p_url); p_url = convertTradosTmx(p_url, m_tmxLevel); // Now we have a new file that contains native content. m_options.setFileName(p_url); m_options.setFileType(com.globalsight.everest.tm.importer.ImportOptions.TYPE_XML); m_tmxLevel = ImportUtil.TMX_LEVEL_NATIVE; } CATEGORY.debug("Analyzing document: " + p_url); // Reset list of locales found in the file. m_sourceLocales = new HashSet(); m_targetLocales = new HashSet(); SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); // Read the DTD and validate. reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_tmxVersion = element.attributeValue(Tmx.VERSION); } public void onEnd(ElementPath path) { } }); reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = new Tmx(element); m_header.setTmxVersion(m_tmxVersion); element.detach(); } }); reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // Record optional source language declared on TU. String srclang = element.attributeValue(Tmx.SRCLANG); if (srclang != null) { m_sourceLocales.add(ImportUtil.normalizeLocale(srclang)); } // Find target languages HashSet langs = new HashSet(); List tuvs = element.selectNodes("./tuv"); for (int i = 0, max = tuvs.size(); i < max; i++) { Element tuv = (Element) tuvs.get(i); String lang = tuv.attributeValue(Tmx.LANG); // Collect TUV locales langs.add(ImportUtil.normalizeLocale(lang)); } langs.remove(srclang); m_targetLocales.addAll(langs); // prune the current element to reduce memory element.detach(); } }); Document document = reader.read(p_url); // Add declared source language from header. String sourceLocale = ImportUtil.normalizeLocale(m_header.getSourceLang()); m_sourceLocales.add(sourceLocale); }
From source file:com.globalsight.everest.tm.importer.TmxReaderThread.java
License:Apache License
public void run() { try {/*from w w w . j a v a 2 s. c o m*/ SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); // Read the DTD and validate. reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_tmxVersion = element.attributeValue(Tmx.VERSION); } public void onEnd(ElementPath path) { } }); reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element.detach(); m_tmx = new Tmx(element); m_tmx.setTmxVersion(m_tmxVersion); m_defaultSrcLang = ImportUtil.normalizeLocale(m_tmx.getSourceLang()); } }); // enable pruning to call me back as each Element is complete reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { m_count++; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element.detach(); m_result = m_results.hireResult(); try { // Normalize spelling of locales. normalizeTu(element); // Filter out targets not to be imported. filterTu(element); // Validate we have source and target. validateTu(element); // Create TU objects SegmentTmTu tu = createTu(element); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug(tu.toDebugString(true)); } m_result.setResultObject(tu); } catch (Throwable ex) { String msg = "Entry " + m_count + ": " + ex.getMessage(); m_result.setError(msg); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug(msg, ex); } else { CATEGORY.warn(msg); } } boolean done = m_results.put(m_result); m_result = null; // Stop reading the TMX file. if (done) { throw new ThreadDeath(); } } }); String url = m_options.getFileName(); Document document = reader.read(url); } catch (ThreadDeath ignore) { CATEGORY.info("ReaderThread: interrupted."); } catch (Throwable ignore) { // Should never happen, and I don't know how to handle // this case other than passing the exception in // m_results, which I won't do for now. CATEGORY.error("unexpected error", ignore); } finally { if (m_result != null) { m_results.fireResult(m_result); } m_results.producerDone(); m_results = null; CATEGORY.debug("ReaderThread: done."); } }
From source file:com.globalsight.everest.tm.util.TmxAnalyzer.java
License:Apache License
public void analyze(String p_url) throws Exception { m_tuCount = 0;// w w w . jav a2s. c o m m_tuvCount = 0; m_localeCount = 0; m_locales = new HashSet(); m_tmxVersion = ""; m_tmx = null; SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Analyzing document: " + p_url); reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_tmxVersion = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element.detach(); m_tmx = new Tmx(element); m_tmx.setTmxVersion(m_tmxVersion); } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_tuCount; if (m_tuCount % 1000 == 0) { log("TU " + m_tuCount); } } public void onEnd(ElementPath path) { Element element = path.getCurrent(); List tuvs = element.selectNodes("//tuv"); m_tuvCount += tuvs.size(); for (int i = 0, max = tuvs.size(); i < max; i++) { Element tuv = (Element) tuvs.get(i); String locale = tuv.attributeValue("lang"); m_locales.add(locale); } // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); m_localeCount = m_locales.size(); log("File: " + p_url); log("TMX version: " + m_tmxVersion); log("Total TUs: " + m_tuCount); log("Total TUVs: " + m_tuvCount); log("Total Locales: " + m_localeCount); for (Iterator it = m_locales.iterator(); it.hasNext();) { String locale = (String) it.next(); log(locale); } // all done }
From source file:com.globalsight.everest.tm.util.TmxLevelSplitter.java
License:Apache License
public void split(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); m_entryCount = 0;/* w w w . j ava 2 s.co m*/ SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Splitting document `" + p_url + "'"); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = element; try { startFiles(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (containsTags(element)) { writeTagsEntry(element.asXML()); m_tagsCount++; } else { writeTextEntry(element.asXML()); m_textCount++; } // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); closeFiles(); log("Processed " + m_entryCount + " TUs, " + m_textCount + " level 1 (text), " + m_tagsCount + " level 2 (tags)"); // all done }
From source file:com.globalsight.everest.tm.util.TmxSplitter.java
License:Apache License
public void split(String p_url, String p_numEntries) throws Exception { final int maxEntries = Integer.parseInt(p_numEntries); final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); m_entryCount = 0;/*from w w w . j a va 2s.co m*/ SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Splitting document `" + p_url + "'"); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = element; try { startFile(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; if (m_entryCount % maxEntries == 0) { try { closeFile(); startFile(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } } } public void onEnd(ElementPath path) { Element element = path.getCurrent(); writeEntry(element.asXML()); // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); closeFile(); // all done }
From source file:com.globalsight.everest.tm.util.trados.TradosFmSgmlTmxToGxml.java
License:Apache License
/** * Main method to call, returns the new filename of the result. */// w w w. ja va 2 s. c o m public String convertToGxml(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to GXML: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { final public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } final public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { final public void onStart(ElementPath path) { } final public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); createNewHeader(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { final public void onStart(ElementPath path) { ++m_entryCount; m_tuError = false; } final public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (m_tuError) { m_errorCount++; } else { writeEntry(element.asXML()); } // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() { final public void onStart(ElementPath path) { } final public void onEnd(ElementPath path) { Element element = path.getCurrent(); try { String gxml = handleTuv(element); Document doc = parse("<root>" + gxml + "</root>"); // Remove old content of seg List content = element.content(); for (int i = content.size() - 1; i >= 0; --i) { ((Node) content.get(i)).detach(); } // Add new GXML content (backwards) content = doc.getRootElement().content(); Collections.reverse(content); for (int i = content.size() - 1; i >= 0; --i) { Node node = (Node) content.get(i); element.add(node.detach()); } } catch (Throwable ex) { m_tuError = true; } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs " + "into file `" + m_filename + "', " + m_errorCount + " errors."); return m_filename; }
From source file:com.globalsight.everest.tm.util.trados.TradosFmTmxToGxml.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *//*from w w w . jav a 2 s. c om*/ public String convertToGxml(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to GXML: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); createNewHeader(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; m_tuError = false; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (m_tuError) { m_errorCount++; } else { writeEntry(element.asXML()); } // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); try { String gxml = handleTuv(element); Document doc = parse("<root>" + gxml + "</root>"); // Remove old content of seg List content = element.content(); for (int i = content.size() - 1; i >= 0; --i) { ((Node) content.get(i)).detach(); } // Add new GXML content (backwards) content = doc.getRootElement().content(); Collections.reverse(content); for (int i = content.size() - 1; i >= 0; --i) { Node node = (Node) content.get(i); element.add(node.detach()); } } catch (Throwable ex) { m_tuError = true; } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs " + "into file `" + m_filename + "', " + m_errorCount + " errors."); return m_filename; }
From source file:com.globalsight.everest.tm.util.trados.TradosHtmlTmxToGxml.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *//*from ww w .j a va 2s.com*/ public String convertToGxml(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to GXML: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); createNewHeader(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; m_tuError = false; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (m_tuError) { m_errorCount++; } else { writeEntry(element.asXML()); } // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); try { element = removeUtElements(element); String gxml = handleTuv(element.getText()); Document doc = parse("<root>" + gxml + "</root>"); // Remove old content of seg List content = element.content(); for (int i = content.size() - 1; i >= 0; --i) { ((Node) content.get(i)).detach(); } // Add new GXML content (backwards) content = doc.getRootElement().content(); Collections.reverse(content); for (int i = content.size() - 1; i >= 0; --i) { Node node = (Node) content.get(i); element.add(node.detach()); } } catch (Throwable ex) { m_tuError = true; } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs into file `" + m_filename + "', " + m_errorCount + " errors."); return m_filename; }
From source file:com.globalsight.everest.tm.util.trados.TradosTmxToRtf.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *///from w ww .ja v a 2s . c om public String convertToRtf(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to RTF: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); Element prop = (Element) element.selectSingleNode("/prop[@type='RTFFontTable']"); if (prop != null) writeEntry(prop.getText()); prop = (Element) element.selectSingleNode("/prop[@type='RTFStyleSheet']"); if (prop != null) writeEntry(prop.getText()); writeOtherRtfHeader(); writeDummyParagraph(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element = removeUtElements(element); writeEntry(replaceUnicodeChars(removeRtfParagraphs(element.asXML()))); writeEntry("\\par"); // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs into file `" + m_filename + "'"); return m_filename; }
From source file:com.globalsight.everest.tm.util.ttx.TtxClean.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *//*from w w w . j ava2 s . c o m*/ public String cleanTtx(String p_url, boolean p_cleanTarget, String p_encoding) throws Exception { m_cleanTarget = p_cleanTarget; // File is called <file>.<ext>.<ttx> final String origName = getBaseName(p_url); final String baseName = getBaseName(origName); final String extension = getExtension(origName); info("Cleaning TTX file to " + (m_cleanTarget ? "target" : "source") + ": `" + p_url + "'"); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); //reader.setEntityResolver(DtdResolver.getInstance()); //reader.setValidation(true); // Fetch the version info early. reader.addHandler("/TRADOStag", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue(Ttx.VERSION); } public void onEnd(ElementPath path) { } }); // Fetch the header info early. reader.addHandler("/TRADOStag/FrontMatter", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); } }); // Read in the entire file (it's not too big normally). Document document = reader.read(p_url); Element body = (Element) document.getRootElement().selectSingleNode("//Body/Raw"); // Remove <ut>, <df> and pull out one TUV. processBody(body); String content = getInnerText(body); String encoding; if (m_cleanTarget) { if (p_encoding != null) { encoding = p_encoding; } else { encoding = "UTF-8"; } } else { // reuse original encoding encoding = m_header.getOriginalEncoding(); } String locale; if (m_cleanTarget) { locale = m_header.getTargetLanguage(); } else { locale = m_header.getSourceLanguage(); } startOutputFile(baseName, locale, extension, encoding); writeEntry(content); closeOutputFile(); info("Result written to file `" + m_filename + "'."); return m_filename; }