Example usage for org.dom4j.io SAXReader read

List of usage examples for org.dom4j.io SAXReader read

Introduction

On this page you can find example usages of org.dom4j.io SAXReader read.

Prototype

public Document read(InputSource in) throws DocumentException 

Source Link

Document

Reads a Document from the given InputSource using SAX.

Usage

From source file:com.globalsight.everest.segmentationhelper.XmlLoader.java

License:Apache License

/**
 * Parses an XML file into a dom4j Document.
 *
 * @param file
 *            the XML file to read
 * @return the Document returned by {@code SAXReader.read(File)}
 * @throws Exception
 *             if reading or parsing fails; the message is that of the
 *             underlying failure, which is attached as the cause
 */
private static Document parserWithSAX(File file) throws Exception {
    SAXReader xmlReader = new SAXReader();
    try {
        return xmlReader.read(file);
    } catch (Exception e) {
        // Chain the original exception so its type and stack trace reach
        // the caller instead of being dumped on stderr and lost.
        throw new Exception(e.getMessage(), e);
    }
}

From source file:com.globalsight.everest.segmentationhelper.XmlLoader.java

License:Apache License

/**
 * Transfer xml text into Document./*from  w ww  .  ja  v  a 2 s .c  om*/
 * 
 * @param xmltext
 */
private static Document parseWithSAX(String xmltext) throws Exception {
    Document doc = null;
    StringReader sr = new StringReader(xmltext);
    SAXReader xmlReader = new SAXReader();
    try {
        doc = xmlReader.read(sr);
    } catch (Exception e) {
        e.printStackTrace();
        throw new Exception(e.getMessage());
    }
    return doc;
}

From source file:com.globalsight.everest.tm.importer.ImportUtil.java

License:Apache License

/**
 * Saves a TM file with simple per-TU validation.
 *
 * TM files easily contain encoding errors or XML rule violations that
 * prevent a correct import. This method copies {@code file} to
 * {@code newFile} line by line, parses the text of every tu on its own and
 * drops any tu that cannot be parsed. Dropped tus are written to the error
 * and log files derived from {@code newFile}; the error/total counts and
 * the elapsed time are written to the info file.
 *
 * @param file
 *            the TM file to read; nothing is done if it does not exist
 * @param newFile
 *            the file that receives the validated content
 * @param status
 *            receives the byte size of each processed line, is polled for
 *            cancellation and finally receives the error/total tu counts
 * @throws Exception
 *             any failure is logged together with the current line and
 *             rethrown
 */
public void saveTmFileWithValidation(File file, File newFile, TmProcessStatus status) throws Exception {
    String encoding = "UTF-8";
    String outEncoding = "UTF-8";
    String logEncoding = "Unicode";
    String strLine = System.getProperty("line.separator");

    int errorCount = 0;
    int totalCount = 0;
    long lineCounter = 0;

    String s = null;

    // Declared outside the try block so that the finally clause can
    // release them on every exit path.
    FileInputStream fIn = null;
    BufferedReader in = null;
    FileOutputStream fOut = null;
    BufferedWriter out = null;
    FileOutputStream fError = null;
    OutputStreamWriter error = null;
    FileOutputStream fInfo = null;
    BufferedWriter info = null;
    FileOutputStream fLog = null;
    OutputStreamWriter log = null;

    try {
        if (file.exists()) {
            CATEGORY.info("Validating TM file: " + newFile.getAbsolutePath());

            Date startTime = new Date();

            File errorFile = getErrorFile(newFile);
            File infoFile = getInfoFile(newFile);
            File logFile = getLogFile(newFile);

            encoding = getEncodingOfXml(file);
            // GBS-2932 : UTF-8 by default
            if (encoding == null) {
                encoding = "UTF-8";
            }

            // Initialize IO.
            fIn = new FileInputStream(file);
            in = new BufferedReader(new InputStreamReader(fIn, encoding));
            fOut = new FileOutputStream(newFile);
            out = new BufferedWriter(new OutputStreamWriter(fOut, outEncoding));
            fError = new FileOutputStream(errorFile);
            error = new OutputStreamWriter(fError, logEncoding);
            fInfo = new FileOutputStream(infoFile);
            info = new BufferedWriter(new OutputStreamWriter(fInfo, logEncoding));
            fLog = new FileOutputStream(logFile);
            log = new OutputStreamWriter(fLog, logEncoding);

            writeHead(error);
            writeHead(log);

            StringBuilder sb = new StringBuilder();

            // It must be <?xml ...
            s = in.readLine();
            s = changeXmlEncodingDec(s, outEncoding);

            status.addSize(s.getBytes(encoding).length);
            if (CATEGORY.isDebugEnabled()) {
                CATEGORY.debug("The content of in.readLine for encoding is " + s);
            }
            sb.append(s);
            sb.append(strLine);

            // NOTE(review): the first line is reported to status a second
            // time here; kept as in the original -- confirm whether the
            // double count is intended.
            status.addSize(s.getBytes(encoding).length);

            // If the second line is define dtd
            s = in.readLine();
            if (CATEGORY.isDebugEnabled()) {
                CATEGORY.debug("The content of in.readLine for doctype is " + s);
            }
            if (s != null && s.indexOf("<!DOCTYPE") > -1) {
                status.addSize(s.getBytes(encoding).length);

                sb.append(s);
                sb.append(strLine);
                s = in.readLine();
                if (CATEGORY.isDebugEnabled()) {
                    CATEGORY.debug("The content of in.readLine is " + s);
                }
            } else if (newFile.getName().endsWith("tmx")) {
                // Don't define the dtd, add it.
                sb.append(TMX_DTD_LINE);
                sb.append(strLine);
            }
            boolean isRemoved = false;
            int count = 0;
            SAXReader reader = new SAXReader();
            while (s != null) {
                if (status.isCanceled()) {
                    CATEGORY.info("Cancelled validating");
                    break;
                }
                status.addSize(s.getBytes(encoding).length);

                if (isHeaderStart(s) && isTradosFontTableStart(s) && isHeaderEnd(s)) {
                    // Header, font table and header end share one line:
                    // keep the header start tag and everything from
                    // </header> on. (The original additionally appended
                    // the numeric index of </header> to the output, which
                    // put a stray number into the XML.)
                    int headerEndTag = s.indexOf(">");
                    sb.append(s.subSequence(0, headerEndTag + 1));
                    int endHeaderTag = s.indexOf("</header>");
                    sb.append(s.substring(endHeaderTag));
                    sb.append(strLine);
                    // NOTE(review): control falls through to the branches
                    // below, where isTradosFontTableStart(s) may switch on
                    // line skipping -- verify this is intended.
                }
                if (isRemoved) {
                    // Inside a font table: skip lines until its end.
                    if (isTradosFontTableEnd(s)) {
                        isRemoved = false;
                    }
                    s = in.readLine();
                    continue;
                }
                if (isTuStartTag(s)) {
                    /* The begin of the tu */
                    // Saves information recoded.
                    if (sb.length() > 0) {
                        out.write(sb.toString());
                        out.flush();
                    }

                    sb = resetStringBuilder(sb);
                    sb.append(s);
                    sb.append(strLine);

                    totalCount++;
                }

                // Validate for the tu.
                else if (isTuEndTag(s)) {
                    /* The end of the tu */
                    sb.append(s);
                    sb.append(strLine);
                    String content = sb.toString();

                    try {
                        /* verify the content */
                        reader.read(new StringReader(content));

                        // Saves the tu if no exception happen.
                        out.write(content);
                        out.flush();
                    } catch (Exception e) {
                        // Give up the tu if any exception happened.
                        error.write(content);

                        log.write(strLine);
                        log.write(SPLIT_LINE);
                        log.write(Integer.toString(++errorCount));
                        log.write(SPLIT_LINE);
                        log.write(strLine);

                        log.write(content);
                        log.write(strLine);
                        log.write(e.getMessage());
                        log.write(strLine);

                    }

                    sb = resetStringBuilder(sb);
                } else if (isTradosFontTableStart(s)) {
                    count++;
                    isRemoved = true;
                } else if (count > 0 && isTradosFontTableEnd(s)) {
                    isRemoved = false;
                } else if ((count > 0) && isHeaderEnd(s)) {
                    sb.append("</header>");
                    sb.append(strLine);
                } else {
                    // Records informations which not included in tu, first
                    // line
                    // etc.
                    sb.append(s);
                    sb.append(strLine);
                }

                s = in.readLine();
                if (CATEGORY.isDebugEnabled()) {
                    CATEGORY.debug("The content of in.readLine is " + s);
                }
                lineCounter++;
            }

            // Records informations which not included in tu and not saved
            // to
            // file. Usually it is "</body> </tmx>".
            if (sb.length() > 0) {
                out.write(sb.toString());
                out.flush();
            }

            in.close();

            out.close();

            CATEGORY.info("Done validating");

            log.write(SPLIT_LINE + SPLIT_LINE + strLine + strLine);
            log.write("Error: " + errorCount + strLine);
            log.write("Total: " + totalCount + strLine);

            // Gets the cost time.
            Date endTime = new Date();
            long costTime = endTime.getTime() - startTime.getTime();
            long h = costTime / (1000 * 60 * 60);
            costTime = costTime % (1000 * 60 * 60);
            long m = costTime / (1000 * 60);
            costTime = costTime % (1000 * 60);
            long se = costTime / 1000;
            StringBuffer time = new StringBuffer("Cost time: ");
            time.append(h).append(" h ").append(m).append(" m ").append(se).append(" s ");

            // Recodes some sample informations.
            String msg = "Error: " + errorCount + strLine;
            info.write(msg);
            msg = "Total: " + totalCount + strLine;
            info.write(msg);
            info.write(time.toString());

            writeFoot(error);
            writeFoot(log);

            error.flush();
            error.close();
            info.flush();
            info.close();
            log.flush();
            log.close();

            if (lineCounter > 10000) {
                CATEGORY.debug("forces jvm to perform gc when the line count reaches 10000. line count: "
                        + lineCounter);
                System.gc();
            }
        }
    } catch (IOException ie) {
        CATEGORY.error("IO Exception occured when save the tm file.");
        CATEGORY.error("The content of current line is " + s);
        CATEGORY.error("The stacktrace of the exception is ", ie);
        throw ie;
    } catch (Exception e) {
        CATEGORY.error("error occured when save the tm file.");
        CATEGORY.error("The content of current line is " + s);
        CATEGORY.error("The stacktrace of the exception is ", e);
        throw e;
    } finally {
        // Safety net for the failure and cancellation paths. Wrappers come
        // first so buffered data is flushed, then the raw file streams in
        // case a wrapper was never constructed. Closing an already closed
        // stream is a no-op.
        java.io.Closeable[] resources = { in, out, error, info, log, fIn, fOut, fError, fInfo, fLog };
        for (int i = 0; i < resources.length; i++) {
            if (resources[i] != null) {
                try {
                    resources[i].close();
                } catch (IOException ignored) {
                    // Best effort: must not mask the primary exception.
                }
            }
        }
    }

    status.setErrorTus(Integer.toString(errorCount));
    status.setTotalTus(Integer.toString(totalCount));
}

From source file:com.globalsight.everest.tm.importer.ImportUtil.java

License:Apache License

/**
 * Saves a TM file with sample validation.
 * /*from   w  w w .  j a va  2s .c o  m*/
 * For some TM files, it vary easy to happen encoding error or xml role
 * error and can't be import correct. This method try to do some sample
 * validations for each tu. If a tu will be give up if inducing a error.
 * 
 * This is used for no UI display requirement.
 * 
 * @param fileName
 * @throws Exception
 */
public void saveTmFileWithValidation(File file, File newFile) throws Exception {
    String encoding = "UTF-8";
    String outEncoding = "UTF-8";
    String logEncoding = "Unicode";
    String strLine = System.getProperty("line.separator");

    int errorCount = 0;
    int totalCount = 0;
    long lineCounter = 0;

    String s = null;

    try {
        if (file.exists()) {
            CATEGORY.info("Validating TM file: " + newFile.getAbsolutePath());

            Date startTime = new Date();

            File errorFile = getErrorFile(newFile);
            File infoFile = getInfoFile(newFile);
            File logFile = getLogFile(newFile);

            encoding = getEncodingOfXml(file);
            // GBS-2932 : UTF-8 by default
            if (encoding == null) {
                encoding = "UTF-8";
            }

            // Initialize IO.
            FileInputStream fIn = new FileInputStream(file);
            BufferedReader in = new BufferedReader(new InputStreamReader(fIn, encoding));
            FileOutputStream fOut = new FileOutputStream(newFile);
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fOut, outEncoding));
            FileOutputStream fError = new FileOutputStream(errorFile);
            OutputStreamWriter error = new OutputStreamWriter(fError, logEncoding);
            FileOutputStream fInfo = new FileOutputStream(infoFile);
            BufferedWriter info = new BufferedWriter(new OutputStreamWriter(fInfo, logEncoding));
            FileOutputStream fLog = new FileOutputStream(logFile);
            OutputStreamWriter log = new OutputStreamWriter(fLog, logEncoding);

            writeHead(error);
            writeHead(log);

            StringBuilder sb = new StringBuilder();

            // It must be <?xml ...
            s = in.readLine();
            s = changeXmlEncodingDec(s, outEncoding);

            if (CATEGORY.isDebugEnabled()) {
                CATEGORY.debug("The content of in.readLine for encoding is " + s);
            }
            sb.append(s);
            sb.append(strLine);

            // If the second line is define dtd
            s = in.readLine();
            if (CATEGORY.isDebugEnabled()) {
                CATEGORY.debug("The content of in.readLine for doctype is " + s);
            }
            if (s != null && s.indexOf("<!DOCTYPE") > -1) {
                sb.append(s);
                sb.append(strLine);
                s = in.readLine();
                if (CATEGORY.isDebugEnabled()) {
                    CATEGORY.debug("The content of in.readLine is " + s);
                }
            } else if (newFile.getName().endsWith("tmx")) {
                // Don't define the dtd, add it.
                sb.append(TMX_DTD_LINE);
                sb.append(strLine);
            }
            boolean isRemoved = false;
            int count = 0;
            SAXReader reader = new SAXReader();
            while (s != null) {
                if (isHeaderStart(s) && isTradosFontTableStart(s) && isHeaderEnd(s)) {
                    int headerEndTag = s.indexOf(">");
                    sb.append(s.subSequence(0, headerEndTag + 1));
                    int endHeaderTag = s.indexOf("</header>");
                    sb.append(s.substring(endHeaderTag));
                    sb.append(endHeaderTag);
                    sb.append(strLine);
                }
                if (isRemoved) {
                    if (isTradosFontTableEnd(s)) {
                        isRemoved = false;
                    }
                    s = in.readLine();
                    continue;
                }
                if (isTuStartTag(s)) {
                    /* The begin of the tu */
                    // Saves information recoded.
                    if (sb.length() > 0) {
                        out.write(sb.toString());
                        out.flush();
                    }

                    sb = resetStringBuilder(sb);
                    sb.append(s);
                    sb.append(strLine);

                    totalCount++;
                }

                // Validate for the tu.
                else if (isTuEndTag(s)) {
                    /* The end of the tu */
                    sb.append(s);
                    sb.append(strLine);
                    String content = sb.toString();

                    try {
                        /* verify the content */
                        reader.read(new StringReader(content));

                        // Saves the tu if no exception happen.
                        out.write(content);
                        out.flush();
                    } catch (Exception e) {
                        // Give up the tu if any exception happened.
                        error.write(content);

                        log.write(strLine);
                        log.write(SPLIT_LINE);
                        log.write(Integer.toString(++errorCount));
                        log.write(SPLIT_LINE);
                        log.write(strLine);

                        log.write(content);
                        log.write(strLine);
                        log.write(e.getMessage());
                        log.write(strLine);

                    }

                    sb = resetStringBuilder(sb);
                } else if (isTradosFontTableStart(s)) {
                    count++;
                    isRemoved = true;
                } else if (count > 0 && isTradosFontTableEnd(s)) {
                    isRemoved = false;
                } else if ((count > 0) && isHeaderEnd(s)) {
                    sb.append("</header>");
                    sb.append(strLine);
                } else {
                    // Records informations which not included in tu, first
                    // line
                    // etc.
                    sb.append(s);
                    sb.append(strLine);
                }

                s = in.readLine();
                if (CATEGORY.isDebugEnabled()) {
                    CATEGORY.debug("The content of in.readLine is " + s);
                }
                lineCounter++;
            }

            // Records informations which not included in tu and not saved
            // to
            // file. Usually it is "</body> </tmx>".
            if (sb.length() > 0) {
                out.write(sb.toString());
                out.flush();
            }

            in.close();

            out.close();

            CATEGORY.info("Done validating");

            log.write(SPLIT_LINE + SPLIT_LINE + strLine + strLine);
            log.write("Error: " + errorCount + strLine);
            log.write("Total: " + totalCount + strLine);

            // Gets the cost time.
            Date endTime = new Date();
            long costTime = endTime.getTime() - startTime.getTime();
            long h = costTime / (1000 * 60 * 60);
            costTime = costTime % (1000 * 60 * 60);
            long m = costTime / (1000 * 60);
            costTime = costTime % (1000 * 60);
            long se = costTime / 1000;
            StringBuffer time = new StringBuffer("Cost time: ");
            time.append(h).append(" h ").append(m).append(" m ").append(se).append(" s ");

            // Recodes some sample informations.
            String msg = "Error: " + errorCount + strLine;
            info.write(msg);
            msg = "Total: " + totalCount + strLine;
            info.write(msg);
            info.write(time.toString());

            writeFoot(error);
            writeFoot(log);

            error.flush();
            error.close();
            info.flush();
            info.close();
            log.flush();
            log.close();

            if (lineCounter > 10000) {
                CATEGORY.debug("suggests jvm to perform gc when the line count reaches 10000. line count: "
                        + lineCounter);
                System.gc();
            }
        }
    } catch (IOException ie) {
        CATEGORY.error("IO Exception occured when save the tm file.");
        CATEGORY.error("The content of current line is " + s);
        CATEGORY.error("The stacktrace of the exception is ", ie);
        throw ie;
    } catch (Exception e) {
        CATEGORY.error("error occured when save the tm file.");
        CATEGORY.error("The content of current line is " + s);
        CATEGORY.error("The stacktrace of the exception is ", e);
        throw e;
    }
}

From source file:com.globalsight.everest.tm.importer.TmxReader.java

License:Apache License

/**
 * Reads an XML file and checks its correctness by validating
 * against the TMX DTD. If there's any error in the file, an
 * exception is thrown.
 *
 * As a side effect, this method builds a list of source and
 * target locales found in the file, including the declared source
 * locale from the header.
 *
 * If the current TMX level is one of the Trados levels, the file is
 * first converted with convertTradosTmx and the converted file is
 * analyzed instead; m_options and m_tmxLevel are updated to match.
 *
 * @param p_url
 *            location of the TMX file, as accepted by
 *            {@code SAXReader.read(String)}
 * @throws Exception
 *             if conversion, reading or validation fails
 */
private void analyzeXml(String p_url) throws Exception {
    if (m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_RTF || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_HTML
            || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_FM || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_FM_SGML
            || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_IL
            || m_tmxLevel == ImportUtil.TMX_LEVEL_TRADOS_XPTAG) {
        // Convert the Trados codes to native System4 codes by
        // converting the file to RTF, saving it as HTML and
        // extracting the resulting TUVs.

        CATEGORY.info("Converting Trados TMX to native TMX: " + p_url);

        p_url = convertTradosTmx(p_url, m_tmxLevel);

        // Now we have a new file that contains native content.
        m_options.setFileName(p_url);
        m_options.setFileType(com.globalsight.everest.tm.importer.ImportOptions.TYPE_XML);

        m_tmxLevel = ImportUtil.TMX_LEVEL_NATIVE;
    }

    CATEGORY.debug("Analyzing document: " + p_url);

    // Reset list of locales found in the file.
    m_sourceLocales = new HashSet();
    m_targetLocales = new HashSet();

    SAXReader reader = new SAXReader();
    reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    // Read the DTD and validate.
    reader.setEntityResolver(DtdResolver.getInstance());
    reader.setValidation(true);

    // Capture the version attribute as soon as the root element opens.
    reader.addHandler("/tmx", new ElementHandler() {
        public void onStart(ElementPath path) {
            Element element = path.getCurrent();

            m_tmxVersion = element.attributeValue(Tmx.VERSION);
        }

        public void onEnd(ElementPath path) {
        }
    });

    // Build the header object once the header element is complete.
    reader.addHandler("/tmx/header", new ElementHandler() {
        public void onStart(ElementPath path) {
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            m_header = new Tmx(element);
            m_header.setTmxVersion(m_tmxVersion);

            element.detach();
        }
    });

    // enable element complete notifications to conserve memory
    reader.addHandler("/tmx/body/tu", new ElementHandler() {
        public void onStart(ElementPath path) {
            ++m_entryCount;
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            // Record optional source language declared on TU.
            String srclang = element.attributeValue(Tmx.SRCLANG);
            String normalizedSrclang = null;
            if (srclang != null) {
                normalizedSrclang = ImportUtil.normalizeLocale(srclang);
                m_sourceLocales.add(normalizedSrclang);
            }

            // Find target languages
            HashSet langs = new HashSet();
            List tuvs = element.selectNodes("./tuv");

            for (int i = 0, max = tuvs.size(); i < max; i++) {
                Element tuv = (Element) tuvs.get(i);

                String lang = tuv.attributeValue(Tmx.LANG);

                // Collect TUV locales
                langs.add(ImportUtil.normalizeLocale(lang));
            }

            // langs holds normalized names, so the source language has to
            // be removed in its normalized form; removing the raw
            // attribute value would miss it whenever normalization
            // changes the spelling.
            langs.remove(normalizedSrclang);
            m_targetLocales.addAll(langs);

            // prune the current element to reduce memory
            element.detach();
        }
    });

    // The handlers above collect everything needed; the returned
    // document is not used.
    reader.read(p_url);

    // Add declared source language from header.
    String sourceLocale = ImportUtil.normalizeLocale(m_header.getSourceLang());

    m_sourceLocales.add(sourceLocale);
}

From source file:com.globalsight.everest.tm.importer.TmxReaderThread.java

License:Apache License

/**
 * Reads the TMX file named by m_options with DTD validation switched on
 * and hands one result per tu element to m_results. Reading stops early
 * when m_results.put() reports that it is done.
 */
public void run() {
    try {
        SAXReader saxReader = new SAXReader();
        saxReader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

        // Validate against the DTD supplied by the resolver.
        saxReader.setEntityResolver(DtdResolver.getInstance());
        saxReader.setValidation(true);

        // Root element: remember the declared TMX version.
        saxReader.addHandler("/tmx", new ElementHandler() {
            public void onStart(ElementPath path) {
                m_tmxVersion = path.getCurrent().attributeValue(Tmx.VERSION);
            }

            public void onEnd(ElementPath path) {
            }
        });

        // Header: detach it, wrap it and derive the default source language.
        saxReader.addHandler("/tmx/header", new ElementHandler() {
            public void onStart(ElementPath path) {
            }

            public void onEnd(ElementPath path) {
                Element header = path.getCurrent();
                header.detach();

                m_tmx = new Tmx(header);
                m_tmx.setTmxVersion(m_tmxVersion);

                String declaredSrcLang = m_tmx.getSourceLang();
                m_defaultSrcLang = ImportUtil.normalizeLocale(declaredSrcLang);
            }
        });

        // Each completed tu is detached and turned into one result.
        saxReader.addHandler("/tmx/body/tu", new ElementHandler() {
            public void onStart(ElementPath path) {
                m_count++;
            }

            public void onEnd(ElementPath path) {
                Element tuElement = path.getCurrent();
                tuElement.detach();

                m_result = m_results.hireResult();

                try {
                    // Normalize locale spelling, drop targets that are not
                    // imported, then check source and target are present.
                    normalizeTu(tuElement);
                    filterTu(tuElement);
                    validateTu(tuElement);

                    SegmentTmTu tu = createTu(tuElement);

                    if (CATEGORY.isDebugEnabled()) {
                        CATEGORY.debug(tu.toDebugString(true));
                    }

                    m_result.setResultObject(tu);
                } catch (Throwable ex) {
                    // A failing entry becomes an error result rather than
                    // aborting the whole read.
                    String msg = "Entry " + m_count + ": " + ex.getMessage();

                    m_result.setError(msg);

                    if (!CATEGORY.isDebugEnabled()) {
                        CATEGORY.warn(msg);
                    } else {
                        CATEGORY.debug(msg, ex);
                    }
                }

                boolean done = m_results.put(m_result);
                m_result = null;

                if (done) {
                    // Unwinds out of the parser; caught at the bottom of
                    // run().
                    throw new ThreadDeath();
                }
            }
        });

        // The handlers do all the work; the returned document is unused.
        saxReader.read(m_options.getFileName());
    } catch (ThreadDeath stopRequested) {
        CATEGORY.info("ReaderThread: interrupted.");
    } catch (Throwable unexpected) {
        // Should never happen; the error is only logged and not passed
        // on through m_results.
        CATEGORY.error("unexpected error", unexpected);
    } finally {
        // Give back a result that was hired but never put.
        if (m_result != null) {
            m_results.fireResult(m_result);
        }

        m_results.producerDone();
        m_results = null;

        CATEGORY.debug("ReaderThread: done.");
    }
}

From source file:com.globalsight.everest.tm.util.TmxAnalyzer.java

License:Apache License

/**
 * Reads a TMX file with DTD validation and logs its statistics: TMX
 * version, number of tu and tuv elements, and the distinct values of the
 * tuv "lang" attribute.
 *
 * @param p_url
 *            location of the TMX file, as accepted by
 *            {@code SAXReader.read(String)}
 * @throws Exception
 *             if reading or validation fails
 */
public void analyze(String p_url) throws Exception {
    // Reset all counters so the analyzer can be reused.
    m_tuCount = 0;
    m_tuvCount = 0;
    m_localeCount = 0;
    m_locales = new HashSet();
    m_tmxVersion = "";
    m_tmx = null;

    SAXReader reader = new SAXReader();
    reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    reader.setEntityResolver(DtdResolver.getInstance());
    reader.setValidation(true);

    log("Analyzing document: " + p_url);

    reader.addHandler("/tmx", new ElementHandler() {
        public void onStart(ElementPath path) {
            Element element = path.getCurrent();

            m_tmxVersion = element.attributeValue("version");
        }

        public void onEnd(ElementPath path) {
        }
    });

    reader.addHandler("/tmx/header", new ElementHandler() {
        public void onStart(ElementPath path) {
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            element.detach();

            m_tmx = new Tmx(element);
            m_tmx.setTmxVersion(m_tmxVersion);
        }
    });

    // enable element complete notifications to conserve memory
    reader.addHandler("/tmx/body/tu", new ElementHandler() {
        public void onStart(ElementPath path) {
            ++m_tuCount;

            // Progress output for large files.
            if (m_tuCount % 1000 == 0) {
                log("TU " + m_tuCount);
            }
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            // Search relative to this tu. The absolute "//tuv" searched
            // the whole document and gave the right count only because
            // earlier tus had already been detached.
            List tuvs = element.selectNodes(".//tuv");

            m_tuvCount += tuvs.size();

            for (int i = 0, max = tuvs.size(); i < max; i++) {
                Element tuv = (Element) tuvs.get(i);

                String locale = tuv.attributeValue("lang");
                m_locales.add(locale);
            }

            // prune the current element to reduce memory
            element.detach();
        }
    });

    // The handlers collect everything; the returned document is unused.
    reader.read(p_url);

    m_localeCount = m_locales.size();

    log("File: " + p_url);
    log("TMX version: " + m_tmxVersion);
    log("Total TUs: " + m_tuCount);
    log("Total TUVs: " + m_tuvCount);
    log("Total Locales: " + m_localeCount);

    for (Iterator it = m_locales.iterator(); it.hasNext();) {
        String locale = (String) it.next();

        log(locale);
    }

    // all done
}

From source file:com.globalsight.everest.tm.util.TmxLevelSplitter.java

License:Apache License

/**
 * Reads a TMX file with DTD validation and passes every tu to either
 * writeTagsEntry or writeTextEntry, depending on containsTags. Output is
 * started via startFiles once the header has been read and finished via
 * closeFiles after the whole document has been processed.
 *
 * @param p_url
 *            location of the TMX file, as accepted by
 *            {@code SAXReader.read(String)}
 * @throws Exception
 *             if reading or validation fails
 */
public void split(String p_url) throws Exception {
    final String baseName = getBaseName(p_url);
    final String extension = getExtension(p_url);

    m_entryCount = 0;

    SAXReader saxReader = new SAXReader();
    saxReader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    saxReader.setEntityResolver(DtdResolver.getInstance());
    saxReader.setValidation(true);

    log("Splitting document `" + p_url + "'");

    // Root element: remember the declared TMX version.
    saxReader.addHandler("/tmx", new ElementHandler() {
        public void onStart(ElementPath path) {
            m_version = path.getCurrent().attributeValue("version");
        }

        public void onEnd(ElementPath path) {
        }
    });

    // Header: keep it, start the output, then prune it from the tree.
    saxReader.addHandler("/tmx/header", new ElementHandler() {
        public void onStart(ElementPath path) {
        }

        public void onEnd(ElementPath path) {
            Element header = path.getCurrent();

            m_header = header;

            try {
                startFiles(baseName, extension);
            } catch (Exception ex) {
                // Failing to start the output terminates the JVM.
                log(ex.toString());
                System.exit(1);
            }

            // Detach to keep the in-memory tree small.
            header.detach();
        }
    });

    // Each completed tu is written out and then pruned from the tree.
    saxReader.addHandler("/tmx/body/tu", new ElementHandler() {
        public void onStart(ElementPath path) {
            ++m_entryCount;
        }

        public void onEnd(ElementPath path) {
            Element tu = path.getCurrent();

            boolean hasTags = containsTags(tu);
            String xml = tu.asXML();

            if (!hasTags) {
                writeTextEntry(xml);

                m_textCount++;
            } else {
                writeTagsEntry(xml);

                m_tagsCount++;
            }

            // Detach to keep the in-memory tree small.
            tu.detach();
        }
    });

    // The handlers do all the work; the returned document is unused.
    saxReader.read(p_url);

    closeFiles();

    log("Processed " + m_entryCount + " TUs, " + m_textCount + " level 1 (text), " + m_tagsCount
            + " level 2 (tags)");
}

From source file:com.globalsight.everest.tm.util.TmxSplitter.java

License:Apache License

/**
 * Splits the TMX document at <code>p_url</code> into several output files
 * holding at most <code>p_numEntries</code> translation units each.
 *
 * The document is parsed with a validating Xerces SAX parser and processed
 * through dom4j element handlers, so each element is detached as soon as it
 * has been handled instead of building the whole tree in memory.
 *
 * @param p_url
 *            location of the TMX document to split
 * @param p_numEntries
 *            maximum number of TUs per output file, as a decimal string;
 *            must be a positive integer
 * @throws IllegalArgumentException
 *             if <code>p_numEntries</code> is not a positive integer
 *             (NumberFormatException when it is not a number at all)
 * @throws Exception
 *             if the document cannot be read or the last file cannot be
 *             closed
 */
public void split(String p_url, String p_numEntries) throws Exception {
    final int maxEntries = Integer.parseInt(p_numEntries);
    if (maxEntries <= 0) {
        // A zero limit would otherwise surface as an ArithmeticException
        // from inside the SAX callback, after output was already started.
        throw new IllegalArgumentException("Number of entries per file must be positive: " + p_numEntries);
    }

    final String baseName = getBaseName(p_url);
    final String extension = getExtension(p_url);

    m_entryCount = 0;

    SAXReader reader = new SAXReader();
    reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    reader.setEntityResolver(DtdResolver.getInstance());
    reader.setValidation(true);

    log("Splitting document `" + p_url + "'");

    // Remember the TMX version declared on the root element.
    reader.addHandler("/tmx", new ElementHandler() {
        public void onStart(ElementPath path) {
            m_version = path.getCurrent().attributeValue("version");
        }

        public void onEnd(ElementPath path) {
        }
    });

    // Once the header is complete, keep it and start the first output file.
    reader.addHandler("/tmx/header", new ElementHandler() {
        public void onStart(ElementPath path) {
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            m_header = element;

            try {
                startFile(baseName, extension);
            } catch (Exception ex) {
                // NOTE(review): terminating the JVM from a parser callback is
                // kept for compatibility with the existing tool behavior.
                log(ex.toString());
                System.exit(1);
            }

            // prune the current element to reduce memory
            element.detach();
        }
    });

    // Each TU is written out and pruned as soon as it is complete.
    reader.addHandler("/tmx/body/tu", new ElementHandler() {
        public void onStart(ElementPath path) {
            ++m_entryCount;

            // Rotate only when the current file already holds maxEntries TUs,
            // i.e. on entries maxEntries + 1, 2 * maxEntries + 1, ...
            // Rotating on every multiple of maxEntries (before the entry is
            // written) left the first file one entry short.
            if (m_entryCount > 1 && (m_entryCount - 1) % maxEntries == 0) {
                try {
                    closeFile();
                    startFile(baseName, extension);
                } catch (Exception ex) {
                    log(ex.toString());
                    System.exit(1);
                }
            }
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            writeEntry(element.asXML());

            // prune the current element to reduce memory
            element.detach();
        }
    });

    try {
        // The returned document is of no interest; all work happens in the
        // handlers registered above.
        reader.read(p_url);
    } catch (Exception ex) {
        // Do not leave the current output file open when parsing fails, and
        // do not let a secondary close failure hide the parse error.
        try {
            closeFile();
        } catch (Exception closeEx) {
            log(closeEx.toString());
        }
        throw ex;
    }

    closeFile();

    // all done
}

From source file:com.globalsight.everest.tm.util.trados.TradosFmSgmlTmxToGxml.java

License:Apache License

/**
 * Main method to call, returns the new filename of the result.
 *
 * Reads the TMX document at <code>p_url</code> with a validating Xerces SAX
 * parser and dom4j element handlers. The content of every
 * <code>seg</code> element is replaced by the result of
 * <code>handleTuv</code>, and every TU whose segments converted without
 * error is written to the output file. TUs with a failing segment are
 * counted as errors and skipped.
 *
 * @param p_url
 *            location of the TMX document to convert
 * @return the name of the file that was written (<code>m_filename</code>)
 * @throws Exception
 *             if the output file cannot be opened or closed, or the
 *             document cannot be read
 */
public String convertToGxml(String p_url) throws Exception {
    final String baseName = getBaseName(p_url);

    info("Converting TMX file to GXML: `" + p_url + "'");
    startOutputFile(baseName);

    m_entryCount = 0;
    // Reset together with the entry counter so the summary below reports the
    // errors of this run only.
    m_errorCount = 0;

    // Reading from a file, need to use Xerces.
    SAXReader reader = new SAXReader();
    reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");
    reader.setEntityResolver(DtdResolver.getInstance());
    reader.setValidation(true);

    // Remember the TMX version declared on the root element.
    reader.addHandler("/tmx", new ElementHandler() {
        final public void onStart(ElementPath path) {
            m_version = path.getCurrent().attributeValue("version");
        }

        final public void onEnd(ElementPath path) {
        }
    });

    // Once the header is complete, record it and build the new header.
    reader.addHandler("/tmx/header", new ElementHandler() {
        final public void onStart(ElementPath path) {
        }

        final public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            setOldHeader(element);
            createNewHeader();

            // prune the current element to reduce memory
            element.detach();
        }
    });

    // Each TU is written (or counted as an error) and pruned when complete.
    reader.addHandler("/tmx/body/tu", new ElementHandler() {
        final public void onStart(ElementPath path) {
            ++m_entryCount;
            // The seg handler below raises this flag if a segment of the
            // current TU cannot be converted.
            m_tuError = false;
        }

        final public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            if (m_tuError) {
                m_errorCount++;
            } else {
                writeEntry(element.asXML());
            }

            // prune the current element to reduce memory
            element.detach();

            if (m_entryCount % 1000 == 0) {
                debug("Entry " + m_entryCount);
            }
        }
    });

    // Segments are converted in place before the enclosing TU is written.
    reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() {
        final public void onStart(ElementPath path) {
        }

        final public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            try {
                String gxml = handleTuv(element);
                Document doc = parse("<root>" + gxml + "</root>");

                // Remove old content of seg
                List content = element.content();
                for (int i = content.size() - 1; i >= 0; --i) {
                    ((Node) content.get(i)).detach();
                }

                // Add new GXML content (backwards)
                content = doc.getRootElement().content();
                Collections.reverse(content);
                for (int i = content.size() - 1; i >= 0; --i) {
                    Node node = (Node) content.get(i);
                    element.add(node.detach());
                }
            } catch (Throwable ex) {
                // Best effort: a failing segment only marks its TU as bad;
                // the TU handler counts it and skips writing it.
                m_tuError = true;
            }
        }
    });

    try {
        // The returned document is of no interest; all work happens in the
        // handlers registered above.
        reader.read(p_url);
    } catch (Exception ex) {
        // The output file was opened above; do not leave it open when
        // parsing fails, and do not let a secondary close failure hide the
        // parse error.
        try {
            closeOutputFile();
        } catch (Exception closeEx) {
            debug(closeEx.toString());
        }
        throw ex;
    }

    closeOutputFile();

    info("Processed " + m_entryCount + " TUs " + "into file `" + m_filename + "', " + m_errorCount
            + " errors.");

    return m_filename;
}