List of usage examples for org.dom4j.io SAXReader read
public Document read(InputSource in) throws DocumentException
Reads a Document from the given InputSource using SAX.
From source file:com.globalsight.everest.segmentationhelper.XmlLoader.java
License:Apache License
/**
 * Parses an XML file into a dom4j {@code Document}.
 *
 * @param file the XML file to read
 * @return the parsed document
 * @throws Exception if the file cannot be read or parsed; the message is the one reported by
 *                   the reader and the original exception is attached as the cause
 */
private static Document parserWithSAX(File file) throws Exception {
    SAXReader xmlReader = new SAXReader();
    try {
        return xmlReader.read(file);
    } catch (Exception e) {
        // Same message as before for callers that display it, but with the cause chained so
        // the original stack trace is not lost.
        throw new Exception(e.getMessage(), e);
    }
}
From source file:com.globalsight.everest.segmentationhelper.XmlLoader.java
License:Apache License
/** * Transfer xml text into Document./*from w ww . ja v a 2 s .c om*/ * * @param xmltext */ private static Document parseWithSAX(String xmltext) throws Exception { Document doc = null; StringReader sr = new StringReader(xmltext); SAXReader xmlReader = new SAXReader(); try { doc = xmlReader.read(sr); } catch (Exception e) { e.printStackTrace(); throw new Exception(e.getMessage()); } return doc; }
From source file:com.globalsight.everest.tm.importer.ImportUtil.java
License:Apache License
/** * Saves a TM file with sample validation. * //from ww w . j a v a 2 s. com * For some TM files, it vary easy to happen encoding error or xml role * error and can't be import correct. This method try to do some sample * validations for each tu. If a tu will be give up if inducing a error. * * @param fileName * @throws Exception */ public void saveTmFileWithValidation(File file, File newFile, TmProcessStatus status) throws Exception { String encoding = "UTF-8"; String outEncoding = "UTF-8"; String logEncoding = "Unicode"; String strLine = System.getProperty("line.separator"); int errorCount = 0; int totalCount = 0; long lineCounter = 0; String s = null; try { if (file.exists()) { CATEGORY.info("Validating TM file: " + newFile.getAbsolutePath()); Date startTime = new Date(); File errorFile = getErrorFile(newFile); File infoFile = getInfoFile(newFile); File logFile = getLogFile(newFile); encoding = getEncodingOfXml(file); // GBS-2932 : UTF-8 by default if (encoding == null) { encoding = "UTF-8"; } // Initialize IO. FileInputStream fIn = new FileInputStream(file); BufferedReader in = new BufferedReader(new InputStreamReader(fIn, encoding)); FileOutputStream fOut = new FileOutputStream(newFile); BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fOut, outEncoding)); FileOutputStream fError = new FileOutputStream(errorFile); OutputStreamWriter error = new OutputStreamWriter(fError, logEncoding); FileOutputStream fInfo = new FileOutputStream(infoFile); BufferedWriter info = new BufferedWriter(new OutputStreamWriter(fInfo, logEncoding)); FileOutputStream fLog = new FileOutputStream(logFile); OutputStreamWriter log = new OutputStreamWriter(fLog, logEncoding); writeHead(error); writeHead(log); StringBuilder sb = new StringBuilder(); // It must be <?xml ... 
s = in.readLine(); s = changeXmlEncodingDec(s, outEncoding); status.addSize(s.getBytes(encoding).length); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine for encoding is " + s); } sb.append(s); sb.append(strLine); status.addSize(s.getBytes(encoding).length); // If the second line is define dtd s = in.readLine(); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine for doctype is " + s); } if (s != null && s.indexOf("<!DOCTYPE") > -1) { status.addSize(s.getBytes(encoding).length); sb.append(s); sb.append(strLine); s = in.readLine(); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine is " + s); } } else if (newFile.getName().endsWith("tmx")) { // Don't define the dtd, add it. sb.append(TMX_DTD_LINE); sb.append(strLine); } boolean isRemoved = false; int count = 0; SAXReader reader = new SAXReader(); while (s != null) { if (status.isCanceled()) { CATEGORY.info("Cancelled validating"); break; } status.addSize(s.getBytes(encoding).length); if (isHeaderStart(s) && isTradosFontTableStart(s) && isHeaderEnd(s)) { int headerEndTag = s.indexOf(">"); sb.append(s.subSequence(0, headerEndTag + 1)); int endHeaderTag = s.indexOf("</header>"); sb.append(s.substring(endHeaderTag)); sb.append(endHeaderTag); sb.append(strLine); } if (isRemoved) { if (isTradosFontTableEnd(s)) { isRemoved = false; } s = in.readLine(); continue; } if (isTuStartTag(s)) { /* The begin of the tu */ // Saves information recoded. if (sb.length() > 0) { out.write(sb.toString()); out.flush(); } sb = resetStringBuilder(sb); sb.append(s); sb.append(strLine); totalCount++; } // Validate for the tu. else if (isTuEndTag(s)) { /* The end of the tu */ sb.append(s); sb.append(strLine); String content = sb.toString(); try { /* verify the content */ reader.read(new StringReader(content)); // Saves the tu if no exception happen. out.write(content); out.flush(); } catch (Exception e) { // Give up the tu if any exception happened. 
error.write(content); log.write(strLine); log.write(SPLIT_LINE); log.write(Integer.toString(++errorCount)); log.write(SPLIT_LINE); log.write(strLine); log.write(content); log.write(strLine); log.write(e.getMessage()); log.write(strLine); } sb = resetStringBuilder(sb); } else if (isTradosFontTableStart(s)) { count++; isRemoved = true; } else if (count > 0 && isTradosFontTableEnd(s)) { isRemoved = false; } else if ((count > 0) && isHeaderEnd(s)) { sb.append("</header>"); sb.append(strLine); } else { // Records informations which not included in tu, first // line // etc. sb.append(s); sb.append(strLine); } s = in.readLine(); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine is " + s); } lineCounter++; } // Records informations which not included in tu and not saved // to // file. Usually it is "</body> </tmx>". if (sb.length() > 0) { out.write(sb.toString()); out.flush(); } in.close(); out.close(); CATEGORY.info("Done validating"); log.write(SPLIT_LINE + SPLIT_LINE + strLine + strLine); log.write("Error: " + errorCount + strLine); log.write("Total: " + totalCount + strLine); // Gets the cost time. Date endTime = new Date(); long costTime = endTime.getTime() - startTime.getTime(); long h = costTime / (1000 * 60 * 60); costTime = costTime % (1000 * 60 * 60); long m = costTime / (1000 * 60); costTime = costTime % (1000 * 60); long se = costTime / 1000; StringBuffer time = new StringBuffer("Cost time: "); time.append(h).append(" h ").append(m).append(" m ").append(se).append(" s "); // Recodes some sample informations. String msg = "Error: " + errorCount + strLine; info.write(msg); msg = "Total: " + totalCount + strLine; info.write(msg); info.write(time.toString()); writeFoot(error); writeFoot(log); error.flush(); error.close(); info.flush(); info.close(); log.flush(); log.close(); if (lineCounter > 10000) { CATEGORY.debug("forces jvm to perform gc when the line count reaches 10000. 
line count: " + lineCounter); System.gc(); } } } catch (IOException ie) { CATEGORY.error("IO Exception occured when save the tm file."); CATEGORY.error("The content of current line is " + s); CATEGORY.error("The stacktrace of the exception is ", ie); throw ie; } catch (Exception e) { CATEGORY.error("error occured when save the tm file."); CATEGORY.error("The content of current line is " + s); CATEGORY.error("The stacktrace of the exception is ", e); throw e; } status.setErrorTus(Integer.toString(errorCount)); status.setTotalTus(Integer.toString(totalCount)); }
From source file:com.globalsight.everest.tm.importer.ImportUtil.java
License:Apache License
/** * Saves a TM file with sample validation. * /*from w w w . j a va 2s .c o m*/ * For some TM files, it vary easy to happen encoding error or xml role * error and can't be import correct. This method try to do some sample * validations for each tu. If a tu will be give up if inducing a error. * * This is used for no UI display requirement. * * @param fileName * @throws Exception */ public void saveTmFileWithValidation(File file, File newFile) throws Exception { String encoding = "UTF-8"; String outEncoding = "UTF-8"; String logEncoding = "Unicode"; String strLine = System.getProperty("line.separator"); int errorCount = 0; int totalCount = 0; long lineCounter = 0; String s = null; try { if (file.exists()) { CATEGORY.info("Validating TM file: " + newFile.getAbsolutePath()); Date startTime = new Date(); File errorFile = getErrorFile(newFile); File infoFile = getInfoFile(newFile); File logFile = getLogFile(newFile); encoding = getEncodingOfXml(file); // GBS-2932 : UTF-8 by default if (encoding == null) { encoding = "UTF-8"; } // Initialize IO. FileInputStream fIn = new FileInputStream(file); BufferedReader in = new BufferedReader(new InputStreamReader(fIn, encoding)); FileOutputStream fOut = new FileOutputStream(newFile); BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fOut, outEncoding)); FileOutputStream fError = new FileOutputStream(errorFile); OutputStreamWriter error = new OutputStreamWriter(fError, logEncoding); FileOutputStream fInfo = new FileOutputStream(infoFile); BufferedWriter info = new BufferedWriter(new OutputStreamWriter(fInfo, logEncoding)); FileOutputStream fLog = new FileOutputStream(logFile); OutputStreamWriter log = new OutputStreamWriter(fLog, logEncoding); writeHead(error); writeHead(log); StringBuilder sb = new StringBuilder(); // It must be <?xml ... 
s = in.readLine(); s = changeXmlEncodingDec(s, outEncoding); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine for encoding is " + s); } sb.append(s); sb.append(strLine); // If the second line is define dtd s = in.readLine(); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine for doctype is " + s); } if (s != null && s.indexOf("<!DOCTYPE") > -1) { sb.append(s); sb.append(strLine); s = in.readLine(); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine is " + s); } } else if (newFile.getName().endsWith("tmx")) { // Don't define the dtd, add it. sb.append(TMX_DTD_LINE); sb.append(strLine); } boolean isRemoved = false; int count = 0; SAXReader reader = new SAXReader(); while (s != null) { if (isHeaderStart(s) && isTradosFontTableStart(s) && isHeaderEnd(s)) { int headerEndTag = s.indexOf(">"); sb.append(s.subSequence(0, headerEndTag + 1)); int endHeaderTag = s.indexOf("</header>"); sb.append(s.substring(endHeaderTag)); sb.append(endHeaderTag); sb.append(strLine); } if (isRemoved) { if (isTradosFontTableEnd(s)) { isRemoved = false; } s = in.readLine(); continue; } if (isTuStartTag(s)) { /* The begin of the tu */ // Saves information recoded. if (sb.length() > 0) { out.write(sb.toString()); out.flush(); } sb = resetStringBuilder(sb); sb.append(s); sb.append(strLine); totalCount++; } // Validate for the tu. else if (isTuEndTag(s)) { /* The end of the tu */ sb.append(s); sb.append(strLine); String content = sb.toString(); try { /* verify the content */ reader.read(new StringReader(content)); // Saves the tu if no exception happen. out.write(content); out.flush(); } catch (Exception e) { // Give up the tu if any exception happened. 
error.write(content); log.write(strLine); log.write(SPLIT_LINE); log.write(Integer.toString(++errorCount)); log.write(SPLIT_LINE); log.write(strLine); log.write(content); log.write(strLine); log.write(e.getMessage()); log.write(strLine); } sb = resetStringBuilder(sb); } else if (isTradosFontTableStart(s)) { count++; isRemoved = true; } else if (count > 0 && isTradosFontTableEnd(s)) { isRemoved = false; } else if ((count > 0) && isHeaderEnd(s)) { sb.append("</header>"); sb.append(strLine); } else { // Records informations which not included in tu, first // line // etc. sb.append(s); sb.append(strLine); } s = in.readLine(); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug("The content of in.readLine is " + s); } lineCounter++; } // Records informations which not included in tu and not saved // to // file. Usually it is "</body> </tmx>". if (sb.length() > 0) { out.write(sb.toString()); out.flush(); } in.close(); out.close(); CATEGORY.info("Done validating"); log.write(SPLIT_LINE + SPLIT_LINE + strLine + strLine); log.write("Error: " + errorCount + strLine); log.write("Total: " + totalCount + strLine); // Gets the cost time. Date endTime = new Date(); long costTime = endTime.getTime() - startTime.getTime(); long h = costTime / (1000 * 60 * 60); costTime = costTime % (1000 * 60 * 60); long m = costTime / (1000 * 60); costTime = costTime % (1000 * 60); long se = costTime / 1000; StringBuffer time = new StringBuffer("Cost time: "); time.append(h).append(" h ").append(m).append(" m ").append(se).append(" s "); // Recodes some sample informations. String msg = "Error: " + errorCount + strLine; info.write(msg); msg = "Total: " + totalCount + strLine; info.write(msg); info.write(time.toString()); writeFoot(error); writeFoot(log); error.flush(); error.close(); info.flush(); info.close(); log.flush(); log.close(); if (lineCounter > 10000) { CATEGORY.debug("suggests jvm to perform gc when the line count reaches 10000. 
line count: " + lineCounter); System.gc(); } } } catch (IOException ie) { CATEGORY.error("IO Exception occured when save the tm file."); CATEGORY.error("The content of current line is " + s); CATEGORY.error("The stacktrace of the exception is ", ie); throw ie; } catch (Exception e) { CATEGORY.error("error occured when save the tm file."); CATEGORY.error("The content of current line is " + s); CATEGORY.error("The stacktrace of the exception is ", e); throw e; } }
From source file:com.globalsight.everest.tm.importer.TmxReader.java
License:Apache License
/** * Reads an XML file and checks its correctness by validating * against the TMX DTD. If there's any error in the file, an * exception is thrown.//from w w w . j a v a2s .c o m * * As a side effect, this method builds a list of source and * target locales found in the file, including the declared source * locale from the header. */ private void analyzeXml(String p_url) throws Exception { if (m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_RTF || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_HTML || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_FM || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_FM_SGML || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_IL || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_XPTAG) { // Convert the Trados codes to native System4 codes by // converting the file to RTF, saving it as HTML and // extracting the resulting TUVs. CATEGORY.info("Converting Trados TMX to native TMX: " + p_url); p_url = convertTradosTmx(p_url, m_tmxLevel); // Now we have a new file that contains native content. m_options.setFileName(p_url); m_options.setFileType(com.globalsight.everest.tm.importer.ImportOptions.TYPE_XML); m_tmxLevel = ImportUtil.TMX_LEVEL_NATIVE; } CATEGORY.debug("Analyzing document: " + p_url); // Reset list of locales found in the file. m_sourceLocales = new HashSet(); m_targetLocales = new HashSet(); SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); // Read the DTD and validate. 
reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_tmxVersion = element.attributeValue(Tmx.VERSION); } public void onEnd(ElementPath path) { } }); reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = new Tmx(element); m_header.setTmxVersion(m_tmxVersion); element.detach(); } }); reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // Record optional source language declared on TU. String srclang = element.attributeValue(Tmx.SRCLANG); if (srclang != null) { m_sourceLocales.add(ImportUtil.normalizeLocale(srclang)); } // Find target languages HashSet langs = new HashSet(); List tuvs = element.selectNodes("./tuv"); for (int i = 0, max = tuvs.size(); i < max; i++) { Element tuv = (Element) tuvs.get(i); String lang = tuv.attributeValue(Tmx.LANG); // Collect TUV locales langs.add(ImportUtil.normalizeLocale(lang)); } langs.remove(srclang); m_targetLocales.addAll(langs); // prune the current element to reduce memory element.detach(); } }); Document document = reader.read(p_url); // Add declared source language from header. String sourceLocale = ImportUtil.normalizeLocale(m_header.getSourceLang()); m_sourceLocales.add(sourceLocale); }
From source file:com.globalsight.everest.tm.importer.TmxReaderThread.java
License:Apache License
public void run() { try {// www. ja v a 2 s . c o m SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); // Read the DTD and validate. reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_tmxVersion = element.attributeValue(Tmx.VERSION); } public void onEnd(ElementPath path) { } }); reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element.detach(); m_tmx = new Tmx(element); m_tmx.setTmxVersion(m_tmxVersion); m_defaultSrcLang = ImportUtil.normalizeLocale(m_tmx.getSourceLang()); } }); // enable pruning to call me back as each Element is complete reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { m_count++; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element.detach(); m_result = m_results.hireResult(); try { // Normalize spelling of locales. normalizeTu(element); // Filter out targets not to be imported. filterTu(element); // Validate we have source and target. validateTu(element); // Create TU objects SegmentTmTu tu = createTu(element); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug(tu.toDebugString(true)); } m_result.setResultObject(tu); } catch (Throwable ex) { String msg = "Entry " + m_count + ": " + ex.getMessage(); m_result.setError(msg); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug(msg, ex); } else { CATEGORY.warn(msg); } } boolean done = m_results.put(m_result); m_result = null; // Stop reading the TMX file. 
if (done) { throw new ThreadDeath(); } } }); String url = m_options.getFileName(); Document document = reader.read(url); } catch (ThreadDeath ignore) { CATEGORY.info("ReaderThread: interrupted."); } catch (Throwable ignore) { // Should never happen, and I don't know how to handle // this case other than passing the exception in // m_results, which I won't do for now. CATEGORY.error("unexpected error", ignore); } finally { if (m_result != null) { m_results.fireResult(m_result); } m_results.producerDone(); m_results = null; CATEGORY.debug("ReaderThread: done."); } }
From source file:com.globalsight.everest.tm.util.TmxAnalyzer.java
License:Apache License
public void analyze(String p_url) throws Exception { m_tuCount = 0;//from w w w.j a v a 2s.co m m_tuvCount = 0; m_localeCount = 0; m_locales = new HashSet(); m_tmxVersion = ""; m_tmx = null; SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Analyzing document: " + p_url); reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_tmxVersion = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element.detach(); m_tmx = new Tmx(element); m_tmx.setTmxVersion(m_tmxVersion); } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_tuCount; if (m_tuCount % 1000 == 0) { log("TU " + m_tuCount); } } public void onEnd(ElementPath path) { Element element = path.getCurrent(); List tuvs = element.selectNodes("//tuv"); m_tuvCount += tuvs.size(); for (int i = 0, max = tuvs.size(); i < max; i++) { Element tuv = (Element) tuvs.get(i); String locale = tuv.attributeValue("lang"); m_locales.add(locale); } // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); m_localeCount = m_locales.size(); log("File: " + p_url); log("TMX version: " + m_tmxVersion); log("Total TUs: " + m_tuCount); log("Total TUVs: " + m_tuvCount); log("Total Locales: " + m_localeCount); for (Iterator it = m_locales.iterator(); it.hasNext();) { String locale = (String) it.next(); log(locale); } // all done }
From source file:com.globalsight.everest.tm.util.TmxLevelSplitter.java
License:Apache License
public void split(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); m_entryCount = 0;//from www . j a va 2 s . c o m SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Splitting document `" + p_url + "'"); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = element; try { startFiles(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (containsTags(element)) { writeTagsEntry(element.asXML()); m_tagsCount++; } else { writeTextEntry(element.asXML()); m_textCount++; } // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); closeFiles(); log("Processed " + m_entryCount + " TUs, " + m_textCount + " level 1 (text), " + m_tagsCount + " level 2 (tags)"); // all done }
From source file:com.globalsight.everest.tm.util.TmxSplitter.java
License:Apache License
public void split(String p_url, String p_numEntries) throws Exception { final int maxEntries = Integer.parseInt(p_numEntries); final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); m_entryCount = 0;/* ww w. ja va 2 s . c om*/ SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); log("Splitting document `" + p_url + "'"); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); m_header = element; try { startFile(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; if (m_entryCount % maxEntries == 0) { try { closeFile(); startFile(baseName, extension); } catch (Exception ex) { log(ex.toString()); System.exit(1); } } } public void onEnd(ElementPath path) { Element element = path.getCurrent(); writeEntry(element.asXML()); // prune the current element to reduce memory element.detach(); element = null; } }); Document document = reader.read(p_url); closeFile(); // all done }
From source file:com.globalsight.everest.tm.util.trados.TradosFmSgmlTmxToGxml.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *//*from ww w . j a v a 2 s . co m*/ public String convertToGxml(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to GXML: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { final public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } final public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { final public void onStart(ElementPath path) { } final public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); createNewHeader(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { final public void onStart(ElementPath path) { ++m_entryCount; m_tuError = false; } final public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (m_tuError) { m_errorCount++; } else { writeEntry(element.asXML()); } // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() { final public void onStart(ElementPath path) { } final public void onEnd(ElementPath path) { Element element = 
path.getCurrent(); try { String gxml = handleTuv(element); Document doc = parse("<root>" + gxml + "</root>"); // Remove old content of seg List content = element.content(); for (int i = content.size() - 1; i >= 0; --i) { ((Node) content.get(i)).detach(); } // Add new GXML content (backwards) content = doc.getRootElement().content(); Collections.reverse(content); for (int i = content.size() - 1; i >= 0; --i) { Node node = (Node) content.get(i); element.add(node.detach()); } } catch (Throwable ex) { m_tuError = true; } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs " + "into file `" + m_filename + "', " + m_errorCount + " errors."); return m_filename; }