Example usage for org.apache.commons.configuration BaseConfiguration BaseConfiguration

List of usage examples for org.apache.commons.configuration BaseConfiguration BaseConfiguration

Introduction

On this page you can find example usages of org.apache.commons.configuration BaseConfiguration BaseConfiguration.

Prototype

BaseConfiguration

Source Link

Usage

From source file: revaligner.service.FileAligner.java

/**
 * Builds the source/target alignment document for the current project and writes it to disk.
 *
 * <p>The method reads three previously generated TXLF files (non-segmented source, non-segmented
 * target, segmented target), walks the source {@code trans-unit}s and, for every extractable one,
 * emits a {@code group} containing one {@code unit} per paragraph (paragraphs are delimited by the
 * {@code &amp;paradel;} marker). Each unit gets a {@code src_para} with its segments and a
 * {@code trg_para} that is paired with the next non-header/footer target paragraph when the source
 * paragraph qualifies for pairing; otherwise the target is left empty. Target paragraphs that were
 * not consumed by the source walk are appended afterwards as groups of their own.
 *
 * <p>Outputs written:
 * <ul>
 *   <li>{@code rev_aligned.xml} in {@code prjfolder} - the alignment document,</li>
 *   <li>{@code target_reformatted/.mp} in {@code prjfolder} - tab-separated lines
 *       {@code mapid, target paragraph index, target segment index},</li>
 *   <li>{@code verifySegsPop.xlsx} in {@code prjfolder} - source/target paragraph HTML side by side.</li>
 * </ul>
 *
 * @return a one-element array whose single slot is never assigned in this method (i.e. holds
 *         {@code null})
 * @throws Exception if parsing any input file, segmenting, or writing any output fails
 */
public String[] createAlignedXML() throws Exception {
    System.out.println("creating aligned xml....");

    // Never populated below; callers receive a one-element array holding null.
    String[] res = new String[1];

    this.txlftrgsegmap = new LinkedHashMap();
    this.alignedfile = (this.prjfolder + File.separator + "rev_aligned.xml");
    this.reformattedtargetmapfile = (this.prjfolder + File.separator + "target_reformatted" + File.separator
            + ".mp");
    // Accumulates the lines of the target map file written at the end.
    StringBuffer sbmp = new StringBuffer();
    if (new File(this.alignedfile).exists()) {
        new File(this.alignedfile).delete();
    }
    // NOTE(review): the segmenter built here is not used anywhere in this method; the calls are
    // retained in case they have side effects - confirm and remove if they do not.
    SegmenterFactory factory = new SegmenterFactory();
    Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage);
    Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig);

    // Skeleton of the output document: <alinger><head/><aligned/><orphans/></alinger>.
    // The root element name is reproduced as-is because it is part of the file format.
    org.dom4j.Document document = DocumentHelper.createDocument();
    org.dom4j.Element root = document.addElement("alinger");
    org.dom4j.Element head = root.addElement("head");
    head.addAttribute("src_lang", this.sourcelanguage);
    head.addAttribute("trg_lang", this.targetlanguage);
    head.addAttribute("creator", this.creatorid);
    org.dom4j.Element aligned = root.addElement("aligned");
    // Created empty; nothing is added to it in this method.
    org.dom4j.Element orphans = root.addElement("orphans");

    org.dom4j.Document document_source_formatted_nonSeg = XmlParser
            .parseXmlFile(this.reformattedsourcetxlf_nonSeg);
    org.dom4j.Element root_source_formatted_nonSeg = document_source_formatted_nonSeg.getRootElement();
    List list_source_formatted_nonSeg = root_source_formatted_nonSeg.selectNodes("//*[name() = 'trans-unit']");

    org.dom4j.Document document_target_nonSeg = XmlParser.parseXmlFile(this.reformattedtargettxlf_nonSeg);
    org.dom4j.Element root_target_nonSeg = document_target_nonSeg.getRootElement();

    List list_target_nonSeg = root_target_nonSeg.selectNodes("//*[name() = 'trans-unit']");

    org.dom4j.Document document_target_seg = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg);
    org.dom4j.Element root_target_seg = document_target_seg.getRootElement();

    // Indexed in parallel with list_target_nonSeg below.
    // NOTE(review): assumes both lists have the same length and order - confirm.
    List list_target_seg = root_target_seg.selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']");
    // Index of the next target paragraph that has not been paired yet.
    int trg_para_count = 0;

    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    // Verification workbook: column 0 = source paragraph, column 1 = paired target paragraph.
    Workbook wb = new Workbook();
    Cells cells = wb.getWorksheets().get(0).getCells();
    // Current workbook row.
    int cnt = 0;

    // Id of the last emitted group; -1 until the first extractable source unit is seen.
    int gcount = -1;
    for (int i = 0; i < list_source_formatted_nonSeg.size(); i++) {
        org.dom4j.Element src_txlf = ((org.dom4j.Element) list_source_formatted_nonSeg.get(i))
                .element("source");
        String merged_text = getTxlfElementText_withFakeTC(src_txlf);
        if (extractionSupportImpl
                .isExtractable(merged_text.replace("&amp;paradel;", "").replace("&amp;parains;", ""))) {
            gcount++;

            org.dom4j.Element group = aligned.addElement("group");
            group.addAttribute("id", Integer.toString(gcount));

            merged_text = trimText(merged_text, true)[0];

            org.dom4j.Element merged_src_text = group.addElement("text");

            merged_src_text.setText(merged_text.replace("&amp;paradel;", "").replace("&amp;parains;", ""));

            // Paragraph view: collapse runs of delimiters, drop leading/trailing ones, then split.
            String[] split_merged_text = merged_text.replaceAll("(&amp;paradel;)+", "&amp;paradel;")
                    .replaceAll("^&amp;paradel;", "").replaceAll("&amp;paradel;$", "").split("&amp;paradel;");
            // Segment view of the same text; segments may still span a paragraph delimiter.
            List<String> segmentsGroup = segmentStringWithRevs(
                    merged_text.replaceAll("(&amp;paradel;)+", "&amp;paradel;").replace("&amp;parains;", ""),
                    this.sourcelanguage);

            // Re-bucket the segments per paragraph: resegmentedGroup.get(k) holds the segments of
            // the k-th paragraph; a segment containing a delimiter is split across buckets.
            List<List<String>> resegmentedGroup = new ArrayList();
            resegmentedGroup.add(new ArrayList());
            int idx = 0;
            String orgs;
            String[] newsegs;
            for (int s = 0; s < segmentsGroup.size(); s++) {
                orgs = (String) segmentsGroup.get(s);
                if (orgs.contains("&amp;paradel;")) {
                    newsegs = orgs.split("&amp;paradel;");
                    for (int ss = 0; ss < newsegs.length; ss++) {
                        String sss = newsegs[ss];
                        if (!sss.trim().equals("")) {
                            ((List) resegmentedGroup.get(idx)).add(fixMissingTags(sss));
                        }
                        // Open a new bucket only if the current one is non-empty and more pieces follow.
                        if ((((List) resegmentedGroup.get(idx)).size() != 0) && (ss != newsegs.length - 1)) {
                            resegmentedGroup.add(new ArrayList());
                            idx++;
                        }
                    }
                    // split() drops a trailing empty piece, so a trailing delimiter is handled here.
                    if (orgs.trim().endsWith("&amp;paradel;")) {
                        resegmentedGroup.add(new ArrayList());
                        idx++;
                    }
                } else {
                    ((List) resegmentedGroup.get(idx)).add(fixMissingTags(orgs));
                }
            }
            // Diagnostic dump when there are more paragraphs than segment buckets.
            // NOTE: in that situation resegmentedGroup.get(j) further down can still throw
            // IndexOutOfBoundsException for a non-empty paragraph j beyond the last bucket.
            if (split_merged_text.length > resegmentedGroup.size()) {
                System.out.println(i);
                System.out.println("merged_text: " + merged_text);
                for (String smt : split_merged_text) {
                    System.out.println("split_merged_text: " + smt);
                }
                for (List<String> smts : resegmentedGroup) {
                    System.out.println("resegmentedGroup: " + smts);
                }
                for (String smtss : segmentsGroup) {
                    System.out.println("segmentedGroup: " + smtss);
                }
            }
            for (int j = 0; j < split_merged_text.length; j++) {
                // Skip paragraphs that are empty once the track-change tags are removed.
                if (!split_merged_text[j].replaceAll("<(/)*ins>|<(/)*del>", "").trim().equals("")) {
                    split_merged_text[j] = fixMissingTags(split_merged_text[j]);

                    org.dom4j.Element unit = group.addElement("unit");
                    unit.addAttribute("id", Integer.toString(j));
                    unit.addAttribute("alignsegs", "false");

                    org.dom4j.Element src = unit.addElement("src_para");
                    org.dom4j.Element src_text = src.addElement("text");
                    // Paragraph carries the "&amp;hf;" marker (presumably header/footer - confirm).
                    boolean ishf = split_merged_text[j].contains("&amp;hf;");
                    boolean isAddedPara = split_merged_text[j].contains("&amp;parains;");
                    src.addAttribute("added", "" + isAddedPara);
                    String[] trim_result = trimText(
                            split_merged_text[j].replace("&amp;parains;", "").replace("&amp;hf;", ""), false);
                    src.addAttribute("lefttrim", trim_result[1]);
                    src.addAttribute("righttrim", trim_result[2]);
                    split_merged_text[j] = trim_result[0];

                    int src_tctype_para = TrackChangeHelper.getTrackChangeType(split_merged_text[j]);
                    src.addAttribute("tctype", TrackChangeType.getName(src_tctype_para));
                    // Text with all insertions removed and deletions kept (changes "rejected").
                    String rejected_src = split_merged_text[j].replaceAll("(?s)<ins>.*?</ins>", "")
                            .replace("<del>", "").replace("</del>", "");
                    if ((!extractionSupportImpl.isExtractable(rejected_src)) || (ishf)) {
                        unit.addAttribute("locked", "true");
                    } else {
                        unit.addAttribute("locked", "false");
                    }
                    src_text.setText(split_merged_text[j]);

                    cells.get(cnt, 0).setHtmlString("<html>" + split_merged_text[j].replace("ins>", "u>")
                            .replace("del>", "strike>").replace("<br> ", "&#8629;<br>") + "</html>");

                    org.dom4j.Element src_segs = src.addElement("segments");
                    List<String> segments = (List) resegmentedGroup.get(j);
                    for (int z = 0; z < segments.size(); z++) {
                        String segment_text = trimText((String) segments.get(z), false)[0];
                        org.dom4j.Element src_seg = src_segs.addElement("src_seg");
                        src_seg.addAttribute("id", Integer.toString(z));
                        src_seg.addAttribute("needreview", "false");
                        src_seg.addAttribute("ignored", "false");
                        int tctype_seg = TrackChangeHelper.getTrackChangeType(segment_text);
                        src_seg.addAttribute("tctype", TrackChangeType.getName(tctype_seg));
                        // Text with deletions removed and insertions kept (changes "accepted").
                        String accepted_t = segment_text.replaceAll("(?s)<del>.*?</del>", "")
                                .replace("<ins>", "").replace("</ins>", "");
                        src_seg.addAttribute("isExtractable",
                                Boolean.toString(extractionSupportImpl.isExtractable(accepted_t)));

                        String rejected_s = segment_text.replaceAll("(?s)<ins>.*?</ins>", "")
                                .replace("<del>", "").replace("</del>", "");
                        if ((!extractionSupportImpl.isExtractable(rejected_s)) || (ishf)) {
                            src_seg.addAttribute("locked", "true");
                        } else {
                            src_seg.addAttribute("locked", "false");
                        }
                        src_seg.setText(segment_text);
                    }
                    org.dom4j.Element trg = unit.addElement("trg_para");
                    // A source paragraph is paired with a target paragraph only if its track-change
                    // type code is not 1 (meaning defined by TrackChangeType, not visible here),
                    // it is not an added paragraph, and it does not carry the hf marker.
                    boolean expectsTarget = (src_tctype_para != 1) && (!isAddedPara) && (!ishf);
                    if (expectsTarget) {
                        // Skip target paragraphs carrying the hf marker. Bounds-checked so that
                        // trailing hf paragraphs fall through to the empty-target branch instead
                        // of running past the end of the list.
                        while ((trg_para_count < list_target_nonSeg.size())
                                && ((org.dom4j.Element) list_target_nonSeg.get(trg_para_count))
                                        .element("source").getText().contains("&hf;")) {
                            trg_para_count++;
                        }
                    }
                    if (expectsTarget && (trg_para_count < list_target_nonSeg.size())) {
                        trg.addAttribute("id", Integer.toString(gcount) + " - " + Integer.toString(j));
                        org.dom4j.Element trg_text = trg.addElement("text");
                        org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg
                                .get(trg_para_count)).element("source");
                        org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) list_target_seg
                                .get(trg_para_count);

                        String trg_formatted_text = getTxlfElementText_normal(trg_txlf);
                        trg_text.setText(trg_formatted_text.replace("&amp;hf;", ""));

                        cells.get(cnt, 1)
                                .setHtmlString("<html>"
                                        + trg_formatted_text.replace("ins>", "u>").replace("del>", "strike>")
                                                .replace("&amp;hf;", "").replace("<br> ", "&#8629;<br>")
                                        + "</html>");
                        cnt++;

                        org.dom4j.Element trg_segs = trg.addElement("segments");
                        List<String> trgsegs = segmentStringWithRevs(trg_formatted_text, this.targetlanguage);
                        List<org.dom4j.Element> srcsegs = src_segs.elements("src_seg");
                        // z = position among emitted trg_seg elements; trg_tmp_cnt = position among
                        // real target segments. They diverge when an empty placeholder is emitted.
                        int trg_tmp_cnt = 0;
                        for (int z = 0; trg_tmp_cnt < trgsegs.size(); z++) {
                            org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
                            trg_seg.addAttribute("id", Integer.toString(z));
                            trg_seg.addAttribute("edited", "false");
                            if ((z < srcsegs.size()) && (((org.dom4j.Element) srcsegs.get(z))
                                    .attributeValue("tctype").equals(TrackChangeType.getName(1)))) {
                                // Source segment z has type code 1: emit an empty placeholder and
                                // do not consume a target segment.
                                trg_seg.addAttribute("isExtractable", "false");
                                trg_seg.setText("");
                            } else {
                                String trgsegtext = ((String) trgsegs.get(trg_tmp_cnt)).replace("&amp;hf;", "")
                                        .trim();
                                trg_seg.addAttribute("isExtractable",
                                        Boolean.toString(extractionSupportImpl.isExtractable(trgsegtext)));
                                trg_seg.setText(trgsegtext);

                                String mapid = Integer.toString(gcount) + " - " + Integer.toString(j) + " - "
                                        + Integer.toString(z);
                                // Result unused; the lookup is kept because it throws if the
                                // segmented target has no matching trans-unit.
                                List t = ((org.dom4j.Element) trg_txlf_seg.elements("trans-unit")
                                        .get(trg_tmp_cnt)).content();

                                sbmp.append(mapid + "\t" + trg_para_count + "\t" + trg_tmp_cnt + "\n");

                                trg_tmp_cnt++;
                            }
                        }
                        trg_para_count++;
                    } else {
                        // No target paragraph for this unit: emit an empty trg_para.
                        trg.addAttribute("id", Integer.toString(gcount) + " - " + Integer.toString(j));
                        org.dom4j.Element trg_text = trg.addElement("text");
                        trg_text.setText("");
                        trg.addElement("segments");
                        cnt++;
                    }
                    // Pad the target side with empty segments so it has at least as many as the source.
                    int trgcnt = trg.element("segments").elements("trg_seg").size();
                    int srccnt = src.element("segments").elements("src_seg").size();
                    if (trgcnt < srccnt) {
                        for (int x = 1; x <= srccnt - trgcnt; x++) {
                            org.dom4j.Element trg_seg = trg.element("segments").addElement("trg_seg");
                            trg_seg.addAttribute("id", Integer.toString(trgcnt + x - 1));
                            trg_seg.addAttribute("edited", "false");
                            trg_seg.addAttribute("isExtractable", "false");
                            trg_seg.setText("");
                        }
                    }
                }
            }
        }
    }
    // Append every target paragraph that was not paired above as a group of its own.
    // Group ids continue from the number of source trans-units.
    int unitcnt = list_source_formatted_nonSeg.size();
    for (int i = trg_para_count; i < list_target_nonSeg.size(); i++) {
        // Index by the loop variable: indexing by a counter that only advances for non-hf
        // paragraphs would re-read the first hf paragraph forever and drop everything after it.
        org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(i)).element("source");
        org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) list_target_seg.get(i);
        if (!trg_txlf.getText().contains("&hf;")) {
            org.dom4j.Element group = aligned.addElement("group");
            group.addAttribute("id", Integer.toString(unitcnt));
            group.addElement("text").setText("");

            org.dom4j.Element unit = group.addElement("unit");
            unit.addAttribute("id", "0");
            unit.addAttribute("alignsegs", "false");
            unit.addAttribute("locked", "false");

            org.dom4j.Element trg = unit.addElement("trg_para");
            trg.addAttribute("id", Integer.toString(unitcnt) + " - 0");
            org.dom4j.Element trg_text = trg.addElement("text");

            String trg_formatted_text = getTxlfElementText_normal(trg_txlf);
            trg_text.setText(trg_formatted_text.replace("&amp;hf;", ""));

            org.dom4j.Element trg_segs = trg.addElement("segments");
            List<String> trgsegs = segmentStringWithRevs(trg_formatted_text.replace("&amp;hf;", ""),
                    this.targetlanguage);
            for (int z = 0; z < trgsegs.size(); z++) {
                org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
                trg_seg.addAttribute("id", Integer.toString(z));
                trg_seg.addAttribute("edited", "false");
                trg_seg.addAttribute("isExtractable",
                        Boolean.toString(extractionSupportImpl.isExtractable((String) trgsegs.get(z))));

                trg_seg.setText(((String) trgsegs.get(z)).trim());

                String mapid = Integer.toString(unitcnt) + " - 0 - " + Integer.toString(z);
                // Result unused; the lookup is kept because it throws if the segmented target
                // has no matching trans-unit.
                List t = ((org.dom4j.Element) trg_txlf_seg.elements("trans-unit").get(z)).content();

                sbmp.append(mapid + "\t" + i + "\t" + z + "\n");
            }
            unitcnt++;
        }
    }
    wb.save(this.prjfolder + File.separator + "verifySegsPop.xlsx");

    // Close each writer in finally so the file handle is released even when writing fails.
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        writer.close();
    }

    OutputStreamWriter writermp = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.reformattedtargetmapfile)), "UTF8");
    try {
        writermp.write(sbmp.toString());
    } finally {
        writermp.close();
    }

    return res;
}

From source file: revaligner.service.FileAligner.java

public void createAlignedXML_auto(String prjid) throws Exception {
    System.out.println("creating aligned xml with nbAligner....");

    this.alignedfile = (this.prjfolder + File.separator + "rev_aligned.xml");
    this.reformattedtargetmapfile = (this.prjfolder + File.separator + "target_reformatted" + File.separator
            + ".mp");
    HashMap<String, String> srcidmap = new HashMap();

    this.nbalignerfolder = (this.prjfolder + File.separator + "nbaligner");
    if (!new File(this.nbalignerfolder).exists()) {
        new File(this.nbalignerfolder).mkdir();
    }//  w  w  w.ja  v a  2s . c o m
    FileUtils.cleanDirectory(new File(this.nbalignerfolder));
    String nbsourcefolder = this.nbalignerfolder + File.separator + this.sourcelanguage;
    new File(nbsourcefolder).mkdir();
    org.dom4j.Document nbsource = DocumentHelper.createDocument();
    org.dom4j.Element root_src = nbsource.addElement("txml");
    root_src.addAttribute("locale", this.sourcelanguage);
    root_src.addAttribute("version", "1.0");
    root_src.addAttribute("segtype", "sentence");
    org.dom4j.Element translatable_src = root_src.addElement("translatable");
    translatable_src.addAttribute("blockId", "1");
    String nbtargetfolder = this.nbalignerfolder + File.separator + this.targetlanguage;
    new File(nbtargetfolder).mkdir();
    org.dom4j.Document nbtarget = DocumentHelper.createDocument();
    org.dom4j.Element root_trg = nbtarget.addElement("txml");
    root_trg.addAttribute("locale", this.targetlanguage);
    root_trg.addAttribute("version", "1.0");
    root_trg.addAttribute("segtype", "sentence");
    org.dom4j.Element translatable_trg = root_trg.addElement("translatable");
    translatable_trg.addAttribute("blockId", "0");
    if (new File(this.alignedfile).exists()) {
        new File(this.alignedfile).delete();
    }
    SegmenterFactory factory = new SegmenterFactory();
    Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage);
    Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig);

    org.dom4j.Document document = DocumentHelper.createDocument();
    org.dom4j.Element root = document.addElement("alinger");
    org.dom4j.Element head = root.addElement("head");
    head.addAttribute("src_lang", this.sourcelanguage);
    head.addAttribute("trg_lang", this.targetlanguage);
    head.addAttribute("creator", this.creatorid);
    org.dom4j.Element aligned = root.addElement("aligned");
    org.dom4j.Element orphans = root.addElement("orphans");

    org.dom4j.Document document_source_formatted_nonSeg = XmlParser
            .parseXmlFile(this.reformattedsourcetxlf_nonSeg);
    org.dom4j.Element root_source_formatted_nonSeg = document_source_formatted_nonSeg.getRootElement();
    List list_source_formatted_nonSeg = root_source_formatted_nonSeg.selectNodes("//*[name() = 'trans-unit']");

    org.dom4j.Document document_target_nonSeg = XmlParser.parseXmlFile(this.reformattedtargettxlf_nonSeg);
    org.dom4j.Element root_target_nonSeg = document_target_nonSeg.getRootElement();

    List list_target_nonSeg = root_target_nonSeg.selectNodes("//*[name() = 'trans-unit']");

    org.dom4j.Document document_target_seg = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg);
    org.dom4j.Element root_target_seg = document_target_seg.getRootElement();

    List list_target_seg = root_target_seg.selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']");

    ExtractionSupportImpl extractionSupportImpl_src = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.sourcelanguage));
    Configuration config_src = new BaseConfiguration();
    config_src.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl_src.setConfiguration(config_src);

    ExtractionSupportImpl extractionSupportImpl_trg = new ExtractionSupportImpl(
            Locale.makeLocale(this.targetlanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config_trg = new BaseConfiguration();
    config_trg.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl_trg.setConfiguration(config_trg);

    boolean issrcfirsthf = true;
    boolean istrgfirsthf = true;
    int gcount = -1;
    int segmentId = 0;
    for (int i = 0; i < list_source_formatted_nonSeg.size(); i++) {
        org.dom4j.Element src_txlf = ((org.dom4j.Element) list_source_formatted_nonSeg.get(i))
                .element("source");
        String merged_text = getTxlfElementText_withFakeTC(src_txlf);
        if (extractionSupportImpl_src
                .isExtractable(merged_text.replace("&amp;paradel;", "").replace("&amp;parains;", ""))) {
            gcount++;

            org.dom4j.Element group = aligned.addElement("group");
            group.addAttribute("id", Integer.toString(gcount));

            merged_text = trimText(merged_text, true)[0];
            org.dom4j.Element merged_src_text = group.addElement("text");

            merged_src_text.setText(merged_text.replace("&amp;paradel;", "").replace("&amp;parains;", ""));

            String[] split_merged_text = merged_text.replaceAll("(&amp;paradel;)+", "&amp;paradel;")
                    .replaceAll("^&amp;paradel;", "").replaceAll("&amp;paradel;$", "").split("&amp;paradel;");
            List<String> segmentsGroup = segmentStringWithRevs(
                    merged_text.replaceAll("(&amp;paradel;)+", "&amp;paradel;").replace("&amp;parains;", ""),
                    this.sourcelanguage);
            List<List<String>> resegmentedGroup = new ArrayList();
            resegmentedGroup.add(new ArrayList());
            int idx = 0;
            String orgs;
            String[] newsegs;
            for (int s = 0; s < segmentsGroup.size(); s++) {
                orgs = (String) segmentsGroup.get(s);
                if (orgs.contains("&amp;paradel;")) {
                    newsegs = orgs.split("&amp;paradel;");
                    for (int ss = 0; ss < newsegs.length; ss++) {
                        String sss = newsegs[ss];
                        if (!sss.trim().equals("")) {
                            ((List) resegmentedGroup.get(idx)).add(fixMissingTags(sss));
                        }
                        if ((((List) resegmentedGroup.get(idx)).size() != 0) && (ss != newsegs.length - 1)) {
                            resegmentedGroup.add(new ArrayList());
                            idx++;
                        }
                    }
                    if (orgs.trim().endsWith("&amp;paradel;")) {
                        resegmentedGroup.add(new ArrayList());
                        idx++;
                    }
                } else {
                    ((List) resegmentedGroup.get(idx)).add(fixMissingTags(orgs));
                }
            }
            if (split_merged_text.length > resegmentedGroup.size()) {
                System.out.println(i);
                System.out.println("merged_text: " + merged_text);
                for (String smt : split_merged_text) {
                    System.out.println("split_merged_text: " + smt);
                }
                for (List<String> smts : resegmentedGroup) {
                    System.out.println("resegmentedGroup: " + smts);
                }
                for (String smtss : segmentsGroup) {
                    System.out.println("segmentedGroup: " + smtss);
                }
            }
            for (int j = 0; j < split_merged_text.length; j++) {
                if (!split_merged_text[j].replaceAll("<(/)*ins>|<(/)*del>", "").trim().equals("")) {
                    split_merged_text[j] = fixMissingTags(split_merged_text[j]);

                    Element unit = group.addElement("unit");
                    unit.addAttribute("id", Integer.toString(j));
                    unit.addAttribute("alignsegs", "false");

                    Element src = unit.addElement("src_para");
                    org.dom4j.Element src_text = src.addElement("text");
                    boolean isAddedPara = split_merged_text[j].contains("&amp;parains;");
                    src.addAttribute("added", "" + isAddedPara);
                    String[] trim_result = trimText(split_merged_text[j].replace("&amp;parains;", ""), false);
                    src.addAttribute("lefttrim", trim_result[1]);
                    src.addAttribute("righttrim", trim_result[2]);
                    split_merged_text[j] = trim_result[0];

                    int src_tctype_para = TrackChangeHelper.getTrackChangeType(split_merged_text[j]);
                    src.addAttribute("tctype", TrackChangeType.getName(src_tctype_para));
                    String rejected_src = split_merged_text[j].replaceAll("(?s)<ins>.*?</ins>", "")
                            .replace("<del>", "").replace("</del>", "");
                    if (!extractionSupportImpl_src.isExtractable(rejected_src)) {
                        unit.addAttribute("locked", "true");
                    } else {
                        unit.addAttribute("locked", "false");
                    }
                    src_text.setText(split_merged_text[j]);

                    org.dom4j.Element src_segs = src.addElement("segments");
                    List<String> segments = (List) resegmentedGroup.get(j);
                    for (int z = 0; z < segments.size(); z++) {
                        String segment_text = trimText((String) segments.get(z), false)[0];
                        org.dom4j.Element src_seg = src_segs.addElement("src_seg");
                        src_seg.addAttribute("id", Integer.toString(z));
                        src_seg.addAttribute("needreview", "false");
                        src_seg.addAttribute("ignored", "false");
                        int tctype_seg = TrackChangeHelper.getTrackChangeType(segment_text);
                        src_seg.addAttribute("tctype", TrackChangeType.getName(tctype_seg));
                        String accepted_t = segment_text.replaceAll("(?s)<del>.*?</del>", "")
                                .replace("<ins>", "").replace("</ins>", "");
                        src_seg.addAttribute("isExtractable",
                                Boolean.toString(extractionSupportImpl_src.isExtractable(accepted_t)));

                        String rejected_s = segment_text.replaceAll("(?s)<ins>.*?</ins>", "")
                                .replace("<del>", "").replace("</del>", "");
                        if (!extractionSupportImpl_src.isExtractable(rejected_s)) {
                            src_seg.addAttribute("locked", "true");
                        } else {
                            src_seg.addAttribute("locked", "false");

                            org.dom4j.Element segment_src = translatable_src.addElement("segment");
                            segment_src.addAttribute("segmentId", Integer.toString(segmentId));

                            srcidmap.put(i + " - " + j + " - " + z, Integer.toString(segmentId));

                            segmentId++;
                            segment_src.addElement("source").setText(rejected_s);
                        }
                        src_seg.setText(segment_text);
                    }
                }
            }
        }
    }
    segmentId = 0;
    for (int i = 0; i < list_target_nonSeg.size(); i++) {
        org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(i)).element("source");

        String trg_formatted_text = getTxlfElementText_normal(trg_txlf);
        List<String> trgsegs = segmentStringWithRevs(trg_formatted_text, this.targetlanguage);
        for (int j = 0; j < trgsegs.size(); j++) {
            String trgseg = ((String) trgsegs.get(j)).trim().replaceAll("(\\s)+", " ");
            if (extractionSupportImpl_trg.isExtractable(trgseg)) {
                org.dom4j.Element segment_trg = translatable_trg.addElement("segment");
                segment_trg.addAttribute("segmentId", Integer.toString(segmentId));
                segmentId++;
                segment_trg.addElement("source").setText(trgseg);
            }
        }
    }
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(
                    new FileOutputStream(nbsourcefolder + File.separator + this.sourcelanguage + ".txml")),
            "UTF8");
    nbsource.write(writer);
    writer.close();

    writer = new OutputStreamWriter(
            new BufferedOutputStream(
                    new FileOutputStream(nbtargetfolder + File.separator + this.targetlanguage + ".txml")),
            "UTF8");
    nbtarget.write(writer);
    writer.close();

    String pahtexe = "\\\\10.2.50.190\\AutoAlignerCLI\\AutoAlignerCLI.exe";

    ProcessBuilder pb = new ProcessBuilder(
            new String[] { pahtexe, "-i", this.nbalignerfolder, "-o", this.nbalignerfolder, "-lang_pairs",
                    this.sourcelanguage + "_" + this.targetlanguage, "-lang_detect", "normal", "-identicals",
                    "-match_filenames", "-txml_or_xmx_output", "-docnames_output", "-disallow_src_merging" });
    pb.redirectErrorStream(true);

    Process p = pb.start();
    InputStreamReader isr = new InputStreamReader(p.getInputStream());
    BufferedReader br = new BufferedReader(isr);

    boolean sentESTTime = false;
    boolean alignstart = false;
    String lineRead;
    while ((lineRead = br.readLine()) != null) {
        System.out.println(lineRead);
        if (lineRead.contains("Aligning...")) {
            alignstart = true;
        } else {
            if ((lineRead.contains("Estimated Time to Completion:")) && (alignstart)) {
                this.estimateNBAlignerCompTime = lineRead.replace("Estimated Time to Completion: ", "")
                        .replace(" Minute(s)", "");
            }
            if ((!this.estimateNBAlignerCompTime.equals("")) && (!sentESTTime)) {
                sentESTTime = true;
                try {
                    int minutes = 200 + Integer.parseInt(this.estimateNBAlignerCompTime);
                    setAlignProgress(prjid, minutes);
                    this.estimateNBAlignerCompTime = "";
                } catch (Exception ex) {
                    ex.printStackTrace();
                }
            }
        }
    }
    p.waitFor();

    for (File file : new File(this.nbalignerfolder).listFiles()) {
        if (file.getName().endsWith(".zip")) {
            UnzipFile.UnZipIt(file.getAbsolutePath(), this.nbalignerfolder);
        }
    }
    String alignedtxml = "";
    for (File file : new File(this.nbalignerfolder).listFiles()) {
        if (file.getName().endsWith(".txml")) {
            alignedtxml = file.getAbsolutePath();
        }
    }
    if (alignedtxml.equals("")) {
        throw new Exception("file didn't aligned by nbaligner");
    }
    HashMap<String, String[]> alignedtrgs = new HashMap();
    List<String[]> missingtrgs = new ArrayList();
    int src_idx = -1;

    org.dom4j.Document alignedtxmldoc = XmlParser.parseXmlFile(alignedtxml);
    org.dom4j.Element root_alignedtxmldoc = alignedtxmldoc.getRootElement();
    for (int i = 0; i < root_alignedtxmldoc.elements("translatable").size(); i++) {
        org.dom4j.Element translatable = (org.dom4j.Element) root_alignedtxmldoc.elements("translatable")
                .get(i);
        for (int j = 0; j < translatable.elements("segment").size(); j++) {
            org.dom4j.Element segment = (org.dom4j.Element) translatable.elements("segment").get(j);
            org.dom4j.Element source = segment.element("source");
            org.dom4j.Element target = segment.element("target");
            if ((source != null) && (!source.getTextTrim().equals(""))) {
                src_idx++;
                if ((target != null) && (!target.getTextTrim().equals(""))) {
                    String matchscore = target.attributeValue("score");
                    int trg_idx = Integer.parseInt(target.attributeValue("sent_no"));
                    if (matchscore.equals("0")) {
                        alignedtrgs.put(Integer.toString(src_idx),
                                new String[] { target.getTextTrim(), "1", Integer.toString(trg_idx) });
                    } else if (target.attribute("original_segments_count") != null) {
                        int merged_cnt = Integer.parseInt(target.attributeValue("original_segments_count"));
                        String trg_idx_str = Integer.toString(trg_idx) + " - "
                                + Integer.toString(trg_idx + merged_cnt - 1);
                        alignedtrgs.put(Integer.toString(src_idx),
                                new String[] { target.getTextTrim(), matchscore, trg_idx_str });
                    } else {
                        alignedtrgs.put(Integer.toString(src_idx),
                                new String[] { target.getTextTrim(), matchscore, Integer.toString(trg_idx) });
                    }
                }
            } else if ((target != null) && (!target.getTextTrim().equals(""))) {
                String matchscore = target.attributeValue("score");
                int trg_idx = Integer.parseInt(target.attributeValue("sent_no"));

                missingtrgs.add(new String[] { target.getTextTrim(), Integer.toString(trg_idx) });
            }
        }
    }
    int null_idx = 0;
    List<org.dom4j.Element> groups = aligned.elements("group");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = (org.dom4j.Element) groups.get(i);
        List<org.dom4j.Element> units = group.elements("unit");
        for (int j = 0; j < units.size(); j++) {
            org.dom4j.Element unit = (org.dom4j.Element) units.get(j);
            org.dom4j.Element src_para = unit.element("src_para");
            org.dom4j.Element src_para_segs = src_para.element("segments");
            org.dom4j.Element trg_para = unit.addElement("trg_para");
            org.dom4j.Element trg_para_segs = trg_para.addElement("segments");
            List<org.dom4j.Element> src_segs = src_para_segs.elements("src_seg");
            for (int z = 0; z < src_segs.size(); z++) {
                org.dom4j.Element src_seg = (org.dom4j.Element) src_segs.get(z);
                org.dom4j.Element trg_seg = trg_para_segs.addElement("trg_seg");

                String mapid = Integer.toString(i) + " - " + Integer.toString(j) + " - " + Integer.toString(z);
                trg_seg.addAttribute("edited", "false");
                String trgsegtext = "";
                if (srcidmap.containsKey(mapid)) {
                    String sourceidintxml = (String) srcidmap.get(mapid);
                    if (alignedtrgs.containsKey(sourceidintxml)) {
                        src_seg.addAttribute("locked", "true");
                        trgsegtext = ((String[]) alignedtrgs.get(sourceidintxml))[0];
                        String score = ((String[]) alignedtrgs.get(sourceidintxml))[1];
                        String targetidintxml = ((String[]) alignedtrgs.get(sourceidintxml))[2];
                        if (Integer.parseInt(score) < needreviewthreshhold) {
                            src_seg.addAttribute("needreview", "true");
                        }
                        trg_seg.addAttribute("id", targetidintxml);
                        trg_seg.addAttribute("isExtractable",
                                Boolean.toString(extractionSupportImpl_trg.isExtractable(trgsegtext)));
                    } else {
                        trg_seg.addAttribute("id", "n - " + null_idx);
                        null_idx++;
                        trg_seg.addAttribute("isExtractable", "false");
                    }
                } else {
                    trg_seg.addAttribute("id", "n - " + null_idx);
                    null_idx++;
                    trg_seg.addAttribute("isExtractable", "false");
                }
                trg_seg.setText(trgsegtext);
            }
        }
    }
    org.dom4j.Element orp_unit = orphans.addElement("unit");
    orp_unit.addAttribute("id", "0");
    org.dom4j.Element orp_trg_para = orp_unit.addElement("trg_para");
    org.dom4j.Element orp_segments = orp_trg_para.addElement("segments");
    for (int i = 0; i < missingtrgs.size(); i++) {
        String orptrgtext = ((String[]) missingtrgs.get(i))[0];
        String orptrgid = ((String[]) missingtrgs.get(i))[1];
        org.dom4j.Element orp_trg_seg = orp_segments.addElement("trg_seg");
        orp_trg_seg.addAttribute("id", orptrgid);
        orp_trg_seg.addAttribute("edited", "false");
        orp_trg_seg.addAttribute("isExtractable",
                Boolean.toString(extractionSupportImpl_trg.isExtractable(orptrgtext)));
        orp_trg_seg.setText(orptrgtext);
    }
    OutputStreamWriter oswriter = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    document.write(oswriter);
    oswriter.close();
}

From source file:revaligner.service.FileAligner.java

/**
 * Rebuilds the two target-segment lookup maps from the segmented, reformatted target TXLF.
 *
 * <p>Every {@code trans-unit} inside a {@code group} with {@code restype="x-paragraph"} is
 * inspected. Units whose whitespace-normalised {@code source} text is extractable receive the
 * next running id (starting at 1) and are recorded in:
 * <ul>
 *   <li>{@code txlftrgsegmap}: id -> the element children of the trans-unit;</li>
 *   <li>{@code txlftrgsewsmap}: id -> {@code {isFirstUnitOfGroup, isLastUnitOfGroup}}.</li>
 * </ul>
 * Both maps are replaced by fresh instances on every call.
 *
 * @throws Exception if the target TXLF cannot be parsed
 */
public void buildTargetContentMap() throws Exception {
    System.out.println("rebuilding target content map file....");

    // Source and target locale are both the target language here.
    ExtractionSupportImpl targetExtraction = new ExtractionSupportImpl(
            Locale.makeLocale(this.targetlanguage), Locale.makeLocale(this.targetlanguage));
    Configuration extractionConfig = new BaseConfiguration();
    extractionConfig.setProperty("extraction.tokens.extract", "all");
    targetExtraction.setConfiguration(extractionConfig);

    this.txlftrgsegmap = new LinkedHashMap();
    this.txlftrgsewsmap = new LinkedHashMap();

    org.dom4j.Element targetRoot = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg).getRootElement();
    List paragraphGroups = targetRoot.selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']");

    int nextSegmentId = 1;
    for (Object groupNode : paragraphGroups) {
        org.dom4j.Element paragraphGroup = (org.dom4j.Element) groupNode;
        List transUnits = paragraphGroup.elements("trans-unit");
        int lastUnitIdx = transUnits.size() - 1;

        for (int unitIdx = 0; unitIdx <= lastUnitIdx; unitIdx++) {
            org.dom4j.Element transUnit = (org.dom4j.Element) transUnits.get(unitIdx);
            String normalisedSource = transUnit.element("source").getText().trim().replaceAll("(\\s)+", " ");
            if (!targetExtraction.isExtractable(normalisedSource)) {
                continue;
            }

            // Keep only element children; text and other node kinds are ignored.
            List childElements = new ArrayList();
            for (Object child : transUnit.content()) {
                if (child instanceof org.dom4j.Element) {
                    childElements.add(child);
                }
            }

            Integer segmentKey = Integer.valueOf(nextSegmentId);
            this.txlftrgsegmap.put(segmentKey, childElements);
            // Position flags are relative to ALL trans-units of the group, extractable or not.
            boolean[] positionFlags = { unitIdx == 0, unitIdx == lastUnitIdx };
            this.txlftrgsewsmap.put(segmentKey, positionFlags);
            nextSegmentId++;
        }
    }
}

From source file:revaligner.service.FileAligner.java

/**
 * Checks that the segments stored in the aligned XML can be merged back into exactly the
 * segments of the segmented source TXLF, and on success annotates that TXLF in place.
 *
 * <p>Steps:
 * <ol>
 *   <li>Collect the {@code source} element of every {@code trans-unit} in {@code sourcetxlf_seg}.</li>
 *   <li>For every {@code group} in the aligned file, re-segment the group text with the
 *       "trados" segmenter and greedily merge consecutive {@code src_seg} entries until their
 *       concatenation matches each segmenter output (whitespace-insensitive).</li>
 *   <li>Write both columns side by side to {@code verifySegs.xlsx} in the project folder.</li>
 *   <li>If the number of extractable merged segments equals the number of trans-units, add an
 *       {@code alt-trans} holding the "rejected" (pre-change) text to every trans-unit whose
 *       merged segment has track-change type 3, then rewrite {@code sourcetxlf_seg}.</li>
 * </ol>
 *
 * @return {@code true} when the counts match (and the TXLF has been rewritten), else {@code false}
 * @throws Exception on XML parsing, spreadsheet or file I/O failures
 */
public boolean verifysegments() throws Exception {
    System.out.println("verifying segments mapping....");
    boolean isValid = false;
    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    org.dom4j.Document document_source = XmlParser.parseXmlFile(this.sourcetxlf_seg);
    org.dom4j.Element root_source = document_source.getRootElement();

    List list_source = root_source.selectNodes("//*[name() = 'trans-unit']");
    int numberOfPara_source = list_source.size();

    List<org.dom4j.Element> text_source = new ArrayList<org.dom4j.Element>();
    Iterator iter_source = list_source.iterator();
    while (iter_source.hasNext()) {
        org.dom4j.Element source = ((org.dom4j.Element) iter_source.next()).element("source");
        text_source.add(source);
    }

    List<String> mergedsegtext = new ArrayList<String>();
    org.dom4j.Document alignedxml = XmlParser.parseXmlFile(this.alignedfile);
    org.dom4j.Element root = alignedxml.getRootElement();
    List groups = root.selectNodes("//*[name() = 'group']");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = (org.dom4j.Element) groups.get(i);
        List units = group.elements("unit");

        // keys: src_seg text with deletions and markup stripped (used for matching);
        // org_keys: the untouched src_seg text (used to build the merged output).
        List<String> keys = new ArrayList<String>();
        List<String> org_keys = new ArrayList<String>();
        for (int j = 0; j < units.size(); j++) {
            org.dom4j.Element unit = (org.dom4j.Element) units.get(j);
            org.dom4j.Element src_para = unit.element("src_para");
            if (src_para == null) {
                continue;
            }
            List segs = src_para.element("segments").elements("src_seg");
            for (int z = 0; z < segs.size(); z++) {
                org.dom4j.Element seg = (org.dom4j.Element) segs.get(z);
                keys.add(seg.getText().replaceAll("(?s)<del>.*?</del>", "").replaceAll("<(/)*ins>", "")
                        .replace("<br>", "").replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
                        .trim());
                org_keys.add(seg.getText());
            }
        }

        SegmenterFactory factory = new SegmenterFactory();
        Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage);
        Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig);
        List<String> finsegs = segmenter.segment(
                group.elementText("text").replaceAll("(?s)<del>.*?</del>", "").replaceAll("<(/)*ins>", "")
                        .replace("<br>", "").replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&"));

        // For each segmenter output, consume consecutive keys until their concatenation matches.
        List<List<Integer>> indices = new ArrayList<List<Integer>>();
        int key_start_index = 0;
        for (int k = 0; k < finsegs.size(); k++) {
            String finsegtext = finsegs.get(k);
            // NOTE(review): replace("", " ") puts a space between every character; together with
            // trim() and the whitespace removal the net effect is a whitespace-insensitive
            // comparison. The empty search string may be a special space character lost in
            // transcription - confirm against the original sources.
            String finsegCompact = finsegtext.replace("", " ").trim().replaceAll("(\\s)+", "");

            StringBuilder combined_key = new StringBuilder();
            List<Integer> indice = new ArrayList<Integer>();
            for (int x = key_start_index; x < keys.size(); x++) {
                combined_key.append(keys.get(x));
                indice.add(Integer.valueOf(x));
                String combinedCompact = combined_key.toString().replace("", " ").trim()
                        .replaceAll("(\\s)+", "");
                if (combinedCompact.equals(finsegCompact)) {
                    indices.add(indice);
                    key_start_index = x + 1;
                    break;
                }
            }
        }
        for (List<Integer> temp_indice : indices) {
            StringBuilder temp = new StringBuilder();
            for (int it : temp_indice) {
                temp.append(org_keys.get(it));
            }
            mergedsegtext.add(temp.toString());
        }
    }

    // rejectedtexts gets one entry per extractable merged segment ("" when nothing to record).
    List<String> rejectedtexts = new ArrayList<String>();
    Workbook wb = new Workbook();
    Worksheet ws = wb.getWorksheets().get(0);
    Cells cells = ws.getCells();
    int count = Math.max(text_source.size(), mergedsegtext.size());
    int t_count = 0;
    for (int i = 0; i < count; i++) {
        StringBuilder t_src = new StringBuilder();
        String t_fom = "";
        if (i < text_source.size()) {
            org.dom4j.Element src = text_source.get(i);
            List srcContent = src.content();
            for (int j = 0; j < srcContent.size(); j++) {
                Object node = srcContent.get(j);
                if (node instanceof org.dom4j.Text) {
                    t_src.append(((org.dom4j.Text) node).getText().replace("&", "&amp;")
                            .replace("<", "&lt;").replace(">", "&gt;"));
                } else if (node instanceof org.dom4j.Element) {
                    org.dom4j.Element e = (org.dom4j.Element) node;
                    if (e.getName().equals("x")) {
                        // attributeValue() yields null for a missing ctype, so an <x> element
                        // without that attribute is skipped instead of raising an NPE.
                        String ctype = e.attributeValue("ctype");
                        if ("x-tab".equals(ctype)) {
                            t_src.append(" ");
                        } else if ("lb".equals(ctype)) {
                            t_src.append("<br> ");
                        }
                    }
                }
            }
        }
        cells.get(i, 0).setHtmlString(t_src.toString());
        if (i < mergedsegtext.size()) {
            t_fom = mergedsegtext.get(i).replace("<ins>", "<u>").replace("</ins>", "</u>")
                    .replace("<del>", "<strike>").replace("</del>", "</strike>");
        }
        // "Accepted" view: deletions dropped, insertions kept as plain text.
        String accepted_t_fom = t_fom.replaceAll("(?s)<strike>.*?</strike>", "").replace("<u>", "")
                .replace("</u>", "").replace("&amp;paradel;", "").replace("<br>", "");
        if (extractionSupportImpl.isExtractable(accepted_t_fom)) {
            String input = "<html>" + t_fom.replace("<br> ", "&#8629;<br>").replace("&amp;paradel;",
                    "<strike>&para;</strike><br>") + "</html>";

            // Column 1 is filled densely: only extractable merged segments get a row.
            cells.get(t_count, 1).setHtmlString(input);
            t_count++;
            if ((i < mergedsegtext.size())
                    && (TrackChangeHelper.getTrackChangeType(mergedsegtext.get(i)) == 3)) {
                // "Rejected" view: insertions dropped, deletions kept as plain text.
                String rejected_t_fom = t_fom.replaceAll("(?s)<u>.*?</u>", "").replace("<strike>", "")
                        .replace("</strike>", "").replace("&amp;paradel;", "").replace("<br>", "");
                rejectedtexts.add(rejected_t_fom);
            } else {
                rejectedtexts.add("");
            }
        }
    }
    wb.save(this.prjfolder + File.separator + "verifySegs.xlsx");

    if (numberOfPara_source == t_count) {
        System.out.println("result: TRUE source: " + numberOfPara_source + " formatted: " + t_count);
        isValid = true;

        // The pattern ends in a literal 'Z' (UTC designator), so format in UTC rather than in
        // the JVM default time zone.
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'");
        timestampFormat.setTimeZone(java.util.TimeZone.getTimeZone("UTC"));
        String timestamp = timestampFormat.format(new Date());

        // Here rejectedtexts.size() == t_count == list_source.size(), so get(r) is in range.
        for (int r = 0; r < list_source.size(); r++) {
            String rejectedText = rejectedtexts.get(r);
            if (rejectedText.equals("")) {
                continue;
            }
            org.dom4j.Element transunit = (org.dom4j.Element) list_source.get(r);
            // addElement() appends at the end; the element is created that way and then a copy
            // is inserted right after <target> (or <source> when there is no target).
            org.dom4j.Element originalbase = transunit.addElement("alt-trans");
            org.dom4j.Element source = transunit.element("source");
            org.dom4j.Element target = transunit.element("target");
            source.addAttribute("gs4tr:seginfo",
                    "<root username=\"TC Aligner\" timestamp=\"" + timestamp + "\"/>");

            // Keep a direct reference to the inserted copy instead of looking it up by name.
            org.dom4j.Element original = (org.dom4j.Element) originalbase.clone();
            org.dom4j.Element anchor = (target != null) ? target : source;
            transunit.elements().add(transunit.elements().indexOf(anchor) + 1, original);
            transunit.remove(originalbase);

            original.addAttribute("alttranstype", "x-previous-source-version");
            original.addAttribute("gs4tr:seginfo",
                    "<root username=\"Original\" timestamp=\"" + timestamp + "\"/>");
            org.dom4j.Element original_source = original.addElement("source");
            original_source.addText(rejectedText);
            original.addElement("target");
        }

        OutputStreamWriter writer = new OutputStreamWriter(
                new BufferedOutputStream(new FileOutputStream(this.sourcetxlf_seg)), "UTF8");
        try {
            document_source.write(writer);
        } finally {
            // Always release the file handle, even when serialisation fails.
            writer.close();
        }
    } else {
        System.out.println("result: false source: " + numberOfPara_source + " formatted: " + t_count);
    }
    return isValid;
}

From source file:revaligner.service.FileAligner.java

/**
 * Builds a first-pass, paragraph-level alignment of the source and target Word documents and
 * writes it to {@code rev_aligned.xml} two directory levels above the source file.
 *
 * <p>Each extractable, non-numeric source paragraph becomes a {@code unit} with a
 * {@code src_para} whose text wraps inserted/moved-to runs in {@code <ins>} and
 * deleted/moved-from runs in {@code <del>}. Its {@code para_type} is {@code insertion},
 * {@code regular}, {@code deletion} or {@code mix}. Paragraphs that contain deleted or
 * unchanged text are paired with the next usable target paragraph. Target paragraphs left over
 * once the source is exhausted are appended as units that contain only a {@code trg_para}.
 *
 * <p>Side effects: saves a field-unlinked copy of the source as {@code sourcefile + ".docx"},
 * sets {@code this.alignedfile}, and replaces any existing file at that path.
 *
 * @throws Exception on document loading, XML writing or file I/O failures
 */
public void align() throws Exception {
    org.dom4j.Document document = DocumentHelper.createDocument();
    org.dom4j.Element root = document.addElement("alinger");
    org.dom4j.Element head = root.addElement("head");
    head.addAttribute("src_lang", this.sourcelanguage);
    head.addAttribute("trg_lang", this.targetlanguage);
    head.addAttribute("creator", this.creatorid);

    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    org.dom4j.Element content = root.addElement("content");
    org.dom4j.Element orphans = root.addElement("orphans");

    com.aspose.words.Document doc_src = new com.aspose.words.Document(this.sourcefile);
    com.aspose.words.Document doc_trg = new com.aspose.words.Document(this.targetfile);

    doc_src.joinRunsWithSameFormatting();
    trimParaLeadingTrailingSpace(doc_src);

    UnlinkFields(doc_src);
    doc_src.save(this.sourcefile + ".docx");

    doc_trg.joinRunsWithSameFormatting();
    trimParaLeadingTrailingSpace(doc_trg);

    UnlinkFields(doc_trg);

    int seqnum = 0;
    int srcparaindex = 0;
    int srcparaindex_accept = 0;
    int trgparaindex = 0;
    // Node type 8 is cast to Paragraph and 21 to Run below; presumably the Aspose NodeType
    // constants PARAGRAPH and RUN - confirm against the Aspose.Words version in use.
    int srcparacnt = doc_src.getChildNodes(8, true).getCount();
    int trgparacnt = doc_trg.getChildNodes(8, true).getCount();

    // Move-range state is deliberately kept across paragraphs: a range opened in one
    // paragraph stays active until its end marker is seen.
    boolean ismovefrom = false;
    boolean ismoveto = false;
    boolean isprvdelpara = false;

    int unitid = 0;
    for (int i = 0; i < srcparacnt; i++) {
        Paragraph para_src = (Paragraph) doc_src.getChildNodes(8, true).get(i);

        String para_text = getParaText(para_src);

        boolean isExtractable = extractionSupportImpl.isExtractable(para_text);
        boolean isNumeric = org.gs4tr.foundation3.core.utils.Text.isNumeric(para_text);
        if ((!para_text.equals("")) && (isExtractable) && (!isNumeric)) {
            String src_para_text = "";
            boolean hasadds = false;
            boolean hasdels = false;
            boolean hasnorm = false;
            for (int j = 0; j < para_src.getChildNodes(0, true).getCount(); j++) {
                com.aspose.words.Node node = para_src.getChildNodes(0, true).get(j);
                // 13/14 and 15/16 toggle the move-from / move-to flags; presumably the
                // move-range start/end node types - confirm against the Aspose NodeType table.
                if (node.getNodeType() == 13) {
                    ismovefrom = true;
                } else if (node.getNodeType() == 14) {
                    ismovefrom = false;
                } else if (node.getNodeType() == 15) {
                    ismoveto = true;
                } else if (node.getNodeType() == 16) {
                    ismoveto = false;
                } else if (node.getNodeType() == 21) {
                    Run run = (Run) node;
                    if (!run.getFont().getName().equals("Wingdings")) {
                        String escaped = escapeAngleBrackets(run.getText());
                        // Revision flags take precedence over the surrounding move range.
                        if ((run.isInsertRevision()) && (!run.isDeleteRevision())) {
                            hasadds = true;
                            src_para_text = src_para_text + "<ins>" + escaped + "</ins>";
                        } else if (run.isDeleteRevision()) {
                            hasdels = true;
                            src_para_text = src_para_text + "<del>" + escaped + "</del>";
                        } else if (ismoveto) {
                            hasadds = true;
                            src_para_text = src_para_text + "<ins>" + escaped + "</ins>";
                        } else if (ismovefrom) {
                            hasdels = true;
                            src_para_text = src_para_text + "<del>" + escaped + "</del>";
                        } else {
                            hasnorm = true;
                            src_para_text = src_para_text + escaped;
                        }
                    }
                }
            }

            String paraType;
            if ((hasadds) && (!hasdels) && (!hasnorm)) {
                paraType = "insertion";
            } else if ((!hasadds) && (!hasdels) && (hasnorm)) {
                paraType = "regular";
            } else if ((!hasadds) && (hasdels) && (!hasnorm)) {
                paraType = "deletion";
            } else {
                paraType = "mix";
            }
            org.dom4j.Element unit = content.addElement("unit");
            org.dom4j.Element src_para = unit.addElement("src_para");
            unit.addAttribute("id", Integer.toString(unitid++));
            src_para.addAttribute("para_type", paraType);
            src_para.addAttribute("para_seq", Integer.toString(srcparaindex));
            src_para.addAttribute("para_seq_acpt", Integer.toString(srcparaindex_accept));
            src_para.addText(wordToHtml(src_para_text));

            // Only paragraphs with deleted or unchanged text consume a target paragraph
            // (isNumeric is already known to be false in this branch).
            if ((hasdels) || (hasnorm)) {
                if (trgparaindex < trgparacnt) {
                    Paragraph para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(trgparaindex);
                    String para_trg_text = getParaText(para_trg);
                    boolean isExtractable_trg = extractionSupportImpl.isExtractable(para_trg_text);
                    if (trgparaindex == trgparacnt - 1) {
                        // Already at the last target paragraph: use it if it has usable text.
                        if ((!para_trg_text.equals("")) && (isExtractable_trg)) {
                            addTrgParaElement(unit, para_trg, trgparaindex);
                        }
                    } else {
                        // Skip blank, non-extractable or purely numeric target paragraphs,
                        // stopping at the last one at the latest.
                        while ((doc_trg.getChildNodes(8, true).get(trgparaindex).getText().trim().equals(""))
                                || (!extractionSupportImpl.isExtractable(
                                        doc_trg.getChildNodes(8, true).get(trgparaindex).getText()))
                                || (org.gs4tr.foundation3.core.utils.Text.isNumeric(
                                        doc_trg.getChildNodes(8, true).get(trgparaindex).getText().trim()))) {
                            trgparaindex++;
                            if (trgparaindex == trgparacnt - 1) {
                                break;
                            }
                        }
                        para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(trgparaindex);
                        para_trg_text = getParaText(para_trg);
                        if ((!para_trg_text.equals("")) && (extractionSupportImpl.isExtractable(para_trg_text))
                                && (!org.gs4tr.foundation3.core.utils.Text.isNumeric(para_trg_text))) {
                            addTrgParaElement(unit, para_trg, trgparaindex);
                        }
                    }
                }
                trgparaindex++;
            }
            if (para_src.isDeleteRevision()) {
                isprvdelpara = true;
            } else {
                seqnum++;
                isprvdelpara = false;
            }
            if ((para_src.isEndOfCell()) || (para_src.isEndOfHeaderFooter()) || (para_src.isEndOfSection())) {
                seqnum++;
                isprvdelpara = false;
            }
            srcparaindex++;
            if (!isprvdelpara) {
                srcparaindex_accept++;
            }
        } else {
            // Paragraph is not emitted, but its move-range markers still update the state.
            for (int j = 0; j < para_src.getChildNodes(0, true).getCount(); j++) {
                com.aspose.words.Node node = para_src.getChildNodes(0, true).get(j);
                if (node.getNodeType() == 13) {
                    ismovefrom = true;
                } else if (node.getNodeType() == 14) {
                    ismovefrom = false;
                } else if (node.getNodeType() == 15) {
                    ismoveto = true;
                } else if (node.getNodeType() == 16) {
                    ismoveto = false;
                }
            }
            if (!para_src.isDeleteRevision()) {
                seqnum++;
                isprvdelpara = false;

                srcparaindex_accept++;
            }
            srcparaindex++;
        }
    }

    // Remaining target paragraphs have no source counterpart: emit them as target-only units.
    for (int i = trgparaindex; i < trgparacnt; i++) {
        Paragraph para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(i);
        if ((!para_trg.getText().trim().equals(""))
                && (extractionSupportImpl.isExtractable(para_trg.getText()))) {
            org.dom4j.Element unit = content.addElement("unit");
            // NOTE(review): leftover units take their id from seqnum while paired units use
            // unitid - confirm the two sequences cannot collide.
            unit.addAttribute("id", Integer.toString(seqnum));
            // para_seq is the paragraph's real index i; a separate counter would fall behind
            // as soon as a paragraph is skipped by the condition above.
            addTrgParaElement(unit, para_trg, i);

            seqnum++;
        }
    }

    this.alignedfile = (new File(new File(this.sourcefile).getParent()).getParent() + "/rev_aligned.xml");
    if (new File(this.alignedfile).exists()) {
        new File(this.alignedfile).delete();
    }
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        // Always release the file handle, even when serialisation fails.
        writer.close();
    }
}

/**
 * Appends a {@code trg_para} child to {@code unit} holding the concatenated, angle-bracket
 * escaped text of the paragraph's runs (runs in the "Wingdings" font are skipped), converted
 * through {@code wordToHtml}, with {@code para_seq} set to {@code paraSeq}.
 */
private void addTrgParaElement(org.dom4j.Element unit, Paragraph para_trg, int paraSeq) throws Exception {
    org.dom4j.Element trg_para = unit.addElement("trg_para");

    StringBuilder trg_para_text = new StringBuilder();
    for (int j = 0; j < para_trg.getChildNodes(21, true).getCount(); j++) {
        Run run = (Run) para_trg.getChildNodes(21, true).get(j);
        if (!run.getFont().getName().equals("Wingdings")) {
            trg_para_text.append(escapeAngleBrackets(run.getText()));
        }
    }
    trg_para.addAttribute("para_seq", Integer.toString(paraSeq));
    trg_para.addText(wordToHtml(trg_para_text.toString()));
}

/** Replaces {@code <} and {@code >} with {@code &lt;} and {@code &gt;}. */
private static String escapeAngleBrackets(String text) {
    return text.replace("<", "&lt;").replace(">", "&gt;");
}

From source file:revaligner.service.FileAligner.java

/**
 * Rewrites the aligned XML file with the paragraph-level alignment supplied by the caller.
 *
 * <p>Units under {@code aligned/group} are visited in document order and counted across groups.
 * For each unit the {@code locked} and {@code alignsegs} attributes are set from {@code locks} and
 * {@code segaligned}, whose entries are matched against the key {@code "<group index> - <unit index>"}.
 * The unit's {@code trg_para} is then rebuilt from the entry of {@code arr} / {@code targets} at the
 * unit's running position. Units beyond {@code arr.length()} are removed, as are groups left with no
 * units. Finally the {@code orphans} element is rebuilt from {@code missings} /
 * {@code missing_targets} and the document is written back over the aligned file as UTF-8.
 *
 * @param arr             one JSON array per aligned unit; element 0 becomes the {@code trg_para} id,
 *                        the remaining elements become {@code trg_seg} texts
 * @param missings        same layout as {@code arr}, one JSON array per orphan unit
 * @param locks           keys of the units to mark as locked
 * @param segaligned      keys of the units to mark as segment-aligned
 * @param targets         paragraph text for each entry of {@code arr} (same index)
 * @param missing_targets paragraph text for each entry of {@code missings} (same index)
 * @param cnt             value stored into {@code nullcnt}
 * @throws FileNotFoundException if the aligned XML file does not exist
 * @throws Exception             on JSON, XML parsing or I/O failures
 */
public void update(JSONArray arr, JSONArray missings, JSONArray locks, JSONArray segaligned, JSONArray targets,
        JSONArray missing_targets, int cnt) throws Exception {
    File alignedFile = new File(this.alignedfile);
    if (!alignedFile.exists()) {
        throw new FileNotFoundException("Could not find aligned xml file");
    }
    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    // Copy the JSON payloads into plain lists before the document is parsed.
    List<JSONArray> trg_list = new ArrayList<JSONArray>();
    for (int i = 0; i < arr.length(); i++) {
        trg_list.add(arr.getJSONArray(i));
    }
    List<JSONArray> missings_list = new ArrayList<JSONArray>();
    for (int i = 0; i < missings.length(); i++) {
        missings_list.add(missings.getJSONArray(i));
    }
    List<String> locks_list = new ArrayList<String>();
    for (int i = 0; i < locks.length(); i++) {
        locks_list.add(locks.getString(i));
    }
    List<String> segaligned_list = new ArrayList<String>();
    for (int i = 0; i < segaligned.length(); i++) {
        segaligned_list.add(segaligned.getString(i));
    }
    this.nullcnt = cnt;

    // Running 1-based position of the current unit across all groups; indexes trg_list / targets.
    int unitcnt = 0;
    org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile);
    org.dom4j.Element root = document.getRootElement();
    List<org.dom4j.Element> groups = root.element("aligned").elements("group");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = groups.get(i);

        List<org.dom4j.Element> units = group.elements("unit");
        for (int j = 0; j < units.size(); j++) {
            unitcnt++;
            org.dom4j.Element unit = units.get(j);
            String seq = i + " - " + j;
            // addAttribute sets the value whether or not the attribute already exists, so units
            // written without these flags (e.g. the overflow unit created by update_seg) do not
            // cause a NullPointerException here.
            unit.addAttribute("locked", Boolean.toString(locks_list.contains(seq)));
            unit.addAttribute("alignsegs", Boolean.toString(segaligned_list.contains(seq)));
            if (unitcnt <= trg_list.size()) {
                JSONArray segs = trg_list.get(unitcnt - 1);
                String trg_para_text = targets.getString(unitcnt - 1);

                org.dom4j.Element trg_para = unit.element("trg_para");
                if (trg_para != null) {
                    trg_para.clearContent();
                } else {
                    trg_para = unit.addElement("trg_para");
                }
                trg_para.addAttribute("id", segs.getString(0));
                org.dom4j.Element trg_text = trg_para.addElement("text");
                org.dom4j.Element trg_segs = trg_para.addElement("segments");
                // Element 0 is the paragraph id; segment texts start at index 1 and get 0-based ids.
                for (int s = 1; s < segs.length(); s++) {
                    String segText = segs.getString(s);
                    org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
                    trg_seg.addAttribute("id", Integer.toString(s - 1));
                    trg_seg.addAttribute("isExtractable",
                            Boolean.toString(extractionSupportImpl.isExtractable(segText)));
                    trg_seg.setText(decodehtmlstring(segText));
                }
                trg_text.setText(decodehtmlstring(trg_para_text));
            } else {
                // No target data left for this unit: drop it and revisit the same index.
                units.remove(j);
                j--;
            }
        }
        if (group.elements("unit").size() == 0) {
            groups.remove(i);
            i--;
        }
    }

    org.dom4j.Element orphans = root.element("orphans");
    orphans.clearContent();
    for (int i = 0; i < missings_list.size(); i++) {
        JSONArray segs = missings_list.get(i);
        String trg_para_text = missing_targets.getString(i);

        org.dom4j.Element unit = orphans.addElement("unit");
        unit.addAttribute("id", Integer.toString(i));
        org.dom4j.Element trg_para = unit.addElement("trg_para");
        trg_para.addAttribute("id", segs.getString(0));
        org.dom4j.Element trg_text = trg_para.addElement("text");
        org.dom4j.Element trg_segs = trg_para.addElement("segments");
        // NOTE(review): orphan segment ids start at 1 here, whereas aligned segment ids above start
        // at 0 — kept as in the original; confirm which numbering the readers of this file expect.
        for (int s = 1; s < segs.length(); s++) {
            String segText = segs.getString(s);
            org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
            trg_seg.addAttribute("id", Integer.toString(s));
            trg_seg.addAttribute("isExtractable",
                    Boolean.toString(extractionSupportImpl.isExtractable(segText)));
            trg_seg.setText(decodehtmlstring(segText));
        }
        trg_text.setText(decodehtmlstring(trg_para_text));
    }

    alignedFile.delete();
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        // Always release the file handle, even when serialisation fails part-way.
        writer.close();
    }
}

From source file:revaligner.service.FileAligner.java

/**
 * Rewrites the aligned XML file with the segment-level alignment supplied by the caller.
 *
 * <p>Every unit under {@code aligned/group} is marked {@code alignsegs="true"}. For units that have
 * a {@code src_para}, each {@code src_seg} gets its {@code locked}, {@code needreview} and
 * {@code ignored} attributes set from {@code locks}, {@code review} and {@code ignore}; entries are
 * matched against the key {@code "<group index> - <unit index> - <segment index>"}. The unit's
 * existing {@code trg_seg} elements are then refilled, in document order, from consecutive entries
 * of {@code targets} / {@code trg_seqs}. A {@code trg_seg} is removed when the targets are exhausted
 * or, for units with a {@code src_para}, when its index is not below the number of source segments.
 * Units left without target segments and groups left without units are removed. Targets that were
 * not consumed are appended in a new group, the {@code orphans} element is rebuilt from
 * {@code missing_targets} / {@code missing_trg_seqs}, the document is written back over the aligned
 * file as UTF-8, and an existing auto-saved copy is deleted.
 *
 * @param targets          target segment texts, in alignment order
 * @param trg_seqs         id for each entry of {@code targets} (same index)
 * @param missing_targets  orphan segment texts
 * @param missing_trg_seqs id for each entry of {@code missing_targets} (same index)
 * @param locks            keys of the source segments to mark as locked
 * @param cnt              value stored into {@code nullcnt}
 * @param edited           ids of target segments to mark as edited; ids starting with
 *                         {@code "n - "} are always marked as edited
 * @param review           keys of the source segments to mark as needing review
 * @param ignore           keys of the source segments to mark as ignored
 * @throws FileNotFoundException if the aligned XML file does not exist
 * @throws Exception             on JSON, XML parsing or I/O failures
 */
public void update_seg(JSONArray targets, JSONArray trg_seqs, JSONArray missing_targets,
        JSONArray missing_trg_seqs, JSONArray locks, int cnt, JSONArray edited, JSONArray review,
        JSONArray ignore) throws Exception {
    File alignedFile = new File(this.alignedfile);
    if (!alignedFile.exists()) {
        throw new FileNotFoundException("Could not find aligned xml file");
    }
    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    // Copy the JSON payloads into plain lists before the document is parsed.
    List<String> trg_list = new ArrayList<String>();
    for (int i = 0; i < targets.length(); i++) {
        trg_list.add(targets.getString(i));
    }
    List<String> trg_seq_list = new ArrayList<String>();
    for (int i = 0; i < trg_seqs.length(); i++) {
        trg_seq_list.add(trg_seqs.getString(i));
    }
    List<String> missing_list = new ArrayList<String>();
    for (int i = 0; i < missing_targets.length(); i++) {
        missing_list.add(missing_targets.getString(i));
    }
    List<String> missing_seq_list = new ArrayList<String>();
    for (int i = 0; i < missing_trg_seqs.length(); i++) {
        missing_seq_list.add(missing_trg_seqs.getString(i));
    }
    List<String> locks_list = new ArrayList<String>();
    for (int i = 0; i < locks.length(); i++) {
        locks_list.add(locks.getString(i));
    }
    this.nullcnt = cnt;

    List<String> edited_list = new ArrayList<String>();
    for (int i = 0; i < edited.length(); i++) {
        edited_list.add(edited.getString(i));
    }
    List<String> review_list = new ArrayList<String>();
    for (int i = 0; i < review.length(); i++) {
        review_list.add(review.getString(i));
    }
    List<String> ignore_list = new ArrayList<String>();
    for (int i = 0; i < ignore.length(); i++) {
        ignore_list.add(ignore.getString(i));
    }

    // Index of the next unconsumed entry in trg_list / trg_seq_list.
    int segcnt = 0;
    org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile);
    org.dom4j.Element root = document.getRootElement();
    List<org.dom4j.Element> groups = root.element("aligned").elements("group");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = groups.get(i);

        List<org.dom4j.Element> units = group.elements("unit");
        for (int j = 0; j < units.size(); j++) {
            org.dom4j.Element unit = units.get(j);
            unit.addAttribute("alignsegs", "true");

            // Units without a source paragraph place no cap on the number of target segments kept.
            int maxTrgSegs = Integer.MAX_VALUE;
            org.dom4j.Element src_para = unit.element("src_para");
            if (src_para != null) {
                List<org.dom4j.Element> src_segs = src_para.element("segments").elements("src_seg");
                for (int z = 0; z < src_segs.size(); z++) {
                    org.dom4j.Element src_seg = src_segs.get(z);
                    String seq = i + " - " + j + " - " + z;
                    // addAttribute sets the value whether or not the attribute already exists.
                    src_seg.addAttribute("locked", Boolean.toString(locks_list.contains(seq)));
                    src_seg.addAttribute("needreview", Boolean.toString(review_list.contains(seq)));
                    src_seg.addAttribute("ignored", Boolean.toString(ignore_list.contains(seq)));
                }
                maxTrgSegs = src_segs.size();
            }

            org.dom4j.Element trg_para = unit.element("trg_para");
            trg_para.remove(trg_para.element("text"));
            org.dom4j.Element trgSegContainer = trg_para.element("segments");
            // The loop walks the list obtained here; removals below go through the parent element.
            List<org.dom4j.Element> trg_segs = trgSegContainer.elements("trg_seg");
            for (int z = 0; z < trg_segs.size(); z++) {
                org.dom4j.Element trg_seg = trg_segs.get(z);
                if (z >= maxTrgSegs || segcnt >= trg_list.size()) {
                    trgSegContainer.remove(trg_seg);
                    continue;
                }
                String seqId = decodehtmlstring(trg_seq_list.get(segcnt));
                trg_seg.addAttribute("id", seqId);
                if (edited_list.contains(seqId) || seqId.startsWith("n - ")) {
                    trg_seg.addAttribute("edited", "true");
                }
                String text = decodehtmlstring(trg_list.get(segcnt));
                trg_seg.addAttribute("isExtractable",
                        Boolean.toString(extractionSupportImpl.isExtractable(text)));
                trg_seg.setText(text);
                segcnt++;
            }
            if (trgSegContainer.elements("trg_seg").size() == 0) {
                group.remove(unit);
            }
        }
        if (group.elements("unit").size() == 0) {
            groups.remove(i);
            i--;
        }
    }

    // Targets that found no existing trg_seg slot go into one extra group appended at the end.
    if (segcnt < trg_list.size()) {
        org.dom4j.Element group = root.element("aligned").addElement("group");
        group.addAttribute("id", Integer.toString(groups.size()));
        org.dom4j.Element unit = group.addElement("unit");
        unit.addAttribute("id", "0");
        org.dom4j.Element trg_para = unit.addElement("trg_para");
        trg_para.addAttribute("id", Integer.toString(groups.size()) + " - 0");
        org.dom4j.Element trgsegs = trg_para.addElement("segments");
        for (int x = segcnt; x < trg_list.size(); x++) {
            String text = decodehtmlstring(trg_list.get(x));
            org.dom4j.Element trg_seg = trgsegs.addElement("trg_seg");
            String seqId = decodehtmlstring(trg_seq_list.get(x));
            trg_seg.addAttribute("id", seqId);
            if (edited_list.contains(seqId) || seqId.startsWith("n - ")) {
                trg_seg.addAttribute("edited", "true");
            }
            trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
            trg_seg.setText(text);
        }
    }

    org.dom4j.Element orphans = root.element("orphans");
    orphans.clearContent();
    org.dom4j.Element unit = orphans.addElement("unit");
    unit.addAttribute("id", "0");
    org.dom4j.Element trg_para = unit.addElement("trg_para");
    trg_para.addAttribute("id", "0 - 0");
    org.dom4j.Element trg_segs = trg_para.addElement("segments");
    for (int i = 0; i < missing_list.size(); i++) {
        org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
        String text = decodehtmlstring(missing_list.get(i));
        // The "n - " prefix is tested on the orphan's own id. The original read trg_seq_list.get(i)
        // here, which inspected an unrelated aligned segment and could run past the end of that list.
        String seqId = decodehtmlstring(missing_seq_list.get(i));
        trg_seg.addAttribute("id", seqId);
        if (edited_list.contains(seqId) || seqId.startsWith("n - ")) {
            trg_seg.addAttribute("edited", "true");
        }
        trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
        trg_seg.setText(text);
    }

    alignedFile.delete();
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        // Always release the file handle, even when serialisation fails part-way.
        writer.close();
    }

    // The saved file supersedes any auto-saved copy. The path may be unset if no auto-save has
    // run on this instance — TODO confirm whether it is initialised elsewhere.
    if (this.auto_saved_alignedfile != null) {
        File autoSaved = new File(this.auto_saved_alignedfile);
        if (autoSaved.exists()) {
            autoSaved.delete();
        }
    }
}

From source file:revaligner.service.FileAligner.java

/**
 * Builds the segment-level alignment supplied by the caller and writes it to an auto-save file
 * next to the aligned XML file ({@code alignedfile + ".temp"}); the aligned file itself is only read.
 *
 * <p>Every unit under {@code aligned/group} is marked {@code alignsegs="true"}. For units that have
 * a {@code src_para}, each {@code src_seg} gets its {@code locked}, {@code needreview} and
 * {@code ignored} attributes set from {@code locks}, {@code review} and {@code ignore}; entries are
 * matched against the key {@code "<group index> - <unit index> - <segment index>"}. The unit's
 * existing {@code trg_seg} elements are then refilled, in document order, from consecutive entries
 * of {@code targets} / {@code trg_seqs}. A {@code trg_seg} is removed when the targets are exhausted
 * or, for units with a {@code src_para}, when its index is not below the number of source segments.
 * Units left without target segments and groups left without units are removed. Targets that were
 * not consumed are appended in a new group and the {@code orphans} element is rebuilt from
 * {@code missing_targets} / {@code missing_trg_seqs}. The resulting document is written as UTF-8
 * to the auto-save path, which is also stored in {@code auto_saved_alignedfile}.
 *
 * @param targets          target segment texts, in alignment order
 * @param trg_seqs         id for each entry of {@code targets} (same index)
 * @param missing_targets  orphan segment texts
 * @param missing_trg_seqs id for each entry of {@code missing_targets} (same index)
 * @param locks            keys of the source segments to mark as locked
 * @param cnt              value stored into {@code nullcnt}
 * @param edited           ids of target segments to mark as edited; ids starting with
 *                         {@code "n - "} are always marked as edited
 * @param review           keys of the source segments to mark as needing review
 * @param ignore           keys of the source segments to mark as ignored
 * @throws FileNotFoundException if the aligned XML file does not exist
 * @throws Exception             on JSON, XML parsing or I/O failures
 */
public void auto_update_seg(JSONArray targets, JSONArray trg_seqs, JSONArray missing_targets,
        JSONArray missing_trg_seqs, JSONArray locks, int cnt, JSONArray edited, JSONArray review,
        JSONArray ignore) throws Exception {
    File alignedFile = new File(this.alignedfile);
    if (!alignedFile.exists()) {
        throw new FileNotFoundException("Could not find aligned xml file");
    }
    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    // Copy the JSON payloads into plain lists before the document is parsed.
    List<String> trg_list = new ArrayList<String>();
    for (int i = 0; i < targets.length(); i++) {
        trg_list.add(targets.getString(i));
    }
    List<String> trg_seq_list = new ArrayList<String>();
    for (int i = 0; i < trg_seqs.length(); i++) {
        trg_seq_list.add(trg_seqs.getString(i));
    }
    List<String> missing_list = new ArrayList<String>();
    for (int i = 0; i < missing_targets.length(); i++) {
        missing_list.add(missing_targets.getString(i));
    }
    List<String> missing_seq_list = new ArrayList<String>();
    for (int i = 0; i < missing_trg_seqs.length(); i++) {
        missing_seq_list.add(missing_trg_seqs.getString(i));
    }
    List<String> locks_list = new ArrayList<String>();
    for (int i = 0; i < locks.length(); i++) {
        locks_list.add(locks.getString(i));
    }
    this.nullcnt = cnt;

    List<String> edited_list = new ArrayList<String>();
    for (int i = 0; i < edited.length(); i++) {
        edited_list.add(edited.getString(i));
    }
    List<String> review_list = new ArrayList<String>();
    for (int i = 0; i < review.length(); i++) {
        review_list.add(review.getString(i));
    }
    List<String> ignore_list = new ArrayList<String>();
    for (int i = 0; i < ignore.length(); i++) {
        ignore_list.add(ignore.getString(i));
    }

    // Index of the next unconsumed entry in trg_list / trg_seq_list.
    int segcnt = 0;
    org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile);
    org.dom4j.Element root = document.getRootElement();
    List<org.dom4j.Element> groups = root.element("aligned").elements("group");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = groups.get(i);

        List<org.dom4j.Element> units = group.elements("unit");
        for (int j = 0; j < units.size(); j++) {
            org.dom4j.Element unit = units.get(j);
            unit.addAttribute("alignsegs", "true");

            // Units without a source paragraph place no cap on the number of target segments kept.
            int maxTrgSegs = Integer.MAX_VALUE;
            org.dom4j.Element src_para = unit.element("src_para");
            if (src_para != null) {
                List<org.dom4j.Element> src_segs = src_para.element("segments").elements("src_seg");
                for (int z = 0; z < src_segs.size(); z++) {
                    org.dom4j.Element src_seg = src_segs.get(z);
                    String seq = i + " - " + j + " - " + z;
                    // addAttribute sets the value whether or not the attribute already exists.
                    src_seg.addAttribute("locked", Boolean.toString(locks_list.contains(seq)));
                    src_seg.addAttribute("needreview", Boolean.toString(review_list.contains(seq)));
                    src_seg.addAttribute("ignored", Boolean.toString(ignore_list.contains(seq)));
                }
                maxTrgSegs = src_segs.size();
            }

            org.dom4j.Element trg_para = unit.element("trg_para");
            trg_para.remove(trg_para.element("text"));
            org.dom4j.Element trgSegContainer = trg_para.element("segments");
            // The loop walks the list obtained here; removals below go through the parent element.
            List<org.dom4j.Element> trg_segs = trgSegContainer.elements("trg_seg");
            for (int z = 0; z < trg_segs.size(); z++) {
                org.dom4j.Element trg_seg = trg_segs.get(z);
                if (z >= maxTrgSegs || segcnt >= trg_list.size()) {
                    trgSegContainer.remove(trg_seg);
                    continue;
                }
                String seqId = decodehtmlstring(trg_seq_list.get(segcnt));
                trg_seg.addAttribute("id", seqId);
                if (edited_list.contains(seqId) || seqId.startsWith("n - ")) {
                    trg_seg.addAttribute("edited", "true");
                }
                String text = decodehtmlstring(trg_list.get(segcnt));
                trg_seg.addAttribute("isExtractable",
                        Boolean.toString(extractionSupportImpl.isExtractable(text)));
                trg_seg.setText(text);
                segcnt++;
            }
            if (trgSegContainer.elements("trg_seg").size() == 0) {
                group.remove(unit);
            }
        }
        if (group.elements("unit").size() == 0) {
            groups.remove(i);
            i--;
        }
    }

    // Targets that found no existing trg_seg slot go into one extra group appended at the end.
    if (segcnt < trg_list.size()) {
        org.dom4j.Element group = root.element("aligned").addElement("group");
        group.addAttribute("id", Integer.toString(groups.size()));
        org.dom4j.Element unit = group.addElement("unit");
        unit.addAttribute("id", "0");
        org.dom4j.Element trg_para = unit.addElement("trg_para");
        org.dom4j.Element trgsegs = trg_para.addElement("segments");
        for (int x = segcnt; x < trg_list.size(); x++) {
            String text = decodehtmlstring(trg_list.get(x));
            org.dom4j.Element trg_seg = trgsegs.addElement("trg_seg");
            String seqId = decodehtmlstring(trg_seq_list.get(x));
            trg_seg.addAttribute("id", seqId);
            if (edited_list.contains(seqId) || seqId.startsWith("n - ")) {
                trg_seg.addAttribute("edited", "true");
            }
            trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
            trg_seg.setText(text);
        }
    }

    org.dom4j.Element orphans = root.element("orphans");
    orphans.clearContent();
    org.dom4j.Element unit = orphans.addElement("unit");
    unit.addAttribute("id", "0");
    org.dom4j.Element trg_para = unit.addElement("trg_para");
    org.dom4j.Element trg_segs = trg_para.addElement("segments");
    for (int i = 0; i < missing_list.size(); i++) {
        org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
        String text = decodehtmlstring(missing_list.get(i));
        // The "n - " prefix is tested on the orphan's own id. The original read trg_seq_list.get(i)
        // here, which inspected an unrelated aligned segment and could run past the end of that list.
        String seqId = decodehtmlstring(missing_seq_list.get(i));
        trg_seg.addAttribute("id", seqId);
        if (edited_list.contains(seqId) || seqId.startsWith("n - ")) {
            trg_seg.addAttribute("edited", "true");
        }
        trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
        trg_seg.setText(text);
    }

    this.auto_saved_alignedfile = (this.alignedfile + ".temp");
    File autoSaved = new File(this.auto_saved_alignedfile);
    if (autoSaved.exists()) {
        autoSaved.delete();
    }
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.auto_saved_alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        // Always release the file handle, even when serialisation fails part-way.
        writer.close();
    }
}

From source file:revaligner.service.FileAligner.java

/**
 * Converts the Word document at {@code filePath} to a TXLF file and segments the result.
 *
 * <p>The document is first loaded, passed through {@code joinRunsWithSameFormatting()} and saved
 * back over {@code filePath}. Any existing {@code filePath + ".txlf"} is deleted, the document is
 * converted with a fixed {@link ConvertDOC} configuration, and {@code segmentTxlf} is run on the
 * produced file.
 *
 * @param filePath         path of the Word document; the file is overwritten in place
 * @param segmentParagraph passed through to {@code segmentTxlf}
 * @param sourcelanguage   language code used to build the conversion locale; also passed to
 *                         {@code segmentTxlf}
 * @return {@code filePath + ".txlf"}
 * @throws Exception if loading, saving, converting or segmenting fails
 */
public String convertFileToTxlf(String filePath, boolean segmentParagraph, String sourcelanguage)
        throws Exception {
    com.aspose.words.Document wordDoc = new com.aspose.words.Document(filePath);
    wordDoc.joinRunsWithSameFormatting();
    wordDoc.save(filePath);

    ArrayList<String> inputFiles = new ArrayList<String>();
    inputFiles.add(filePath);

    // Remove a TXLF left over from an earlier conversion of the same file.
    final String txlfPath = filePath + ".txlf";
    File staleTxlf = new File(txlfPath);
    if (staleTxlf.exists()) {
        staleTxlf.delete();
    }

    Locale sourceLocale = Locale.makeLocale(sourcelanguage);

    // Converter settings, applied in the order listed.
    String[][] converterSettings = {
            { "word.acceptTrackChanges", "true" },
            { "word.extractDropDownList", "false" },
            { "word.extractEquations", "false" },
            { "word.extractComments", "false" },
            { "extraction.tokens.extract", "all" },
            { "word.translateHyperlinkText", "true" },
            { "word.translateHyperlinkValue", "false" },
            { "word.ignoreBiLingualStyles", "true" },
    };
    Configuration converterConfig = new BaseConfiguration();
    for (String[] setting : converterSettings) {
        converterConfig.setProperty(setting[0], setting[1]);
    }

    ConvertDOC docConverter = new ConvertDOC();
    docConverter.setConfiguration(converterConfig);
    docConverter.setIgnoreSuccessfullConversion(true);
    docConverter.convert(inputFiles, sourceLocale);

    segmentTxlf(txlfPath, segmentParagraph, sourcelanguage);

    return txlfPath;
}

From source file:revaligner.service.FileAligner.java

/**
 * Compares the paragraphs of the original source TXLF with those of the reformatted source TXLF.
 *
 * <p>Every {@code trans-unit/source} of both files is rendered to a small HTML string. The
 * original text goes to column 0 of a workbook (one row per trans-unit). A formatted paragraph is
 * written to column 1 only when it is still extractable after struck-through spans, underline
 * tags and the {@code &paradel;}/{@code &parains;}/{@code &hf;} markers have been removed. The
 * workbook is saved as {@code verifyParas.xlsx} in the project folder.
 *
 * @return {@code true} when the number of source trans-units equals the number of extractable
 *         formatted paragraphs, {@code false} otherwise
 * @throws Exception if either TXLF cannot be parsed or the workbook cannot be saved
 */
public boolean verifyParas() throws Exception {
    System.out.println("verifying paragraphs mapping....");
    boolean isValid = false;
    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    org.dom4j.Document document_source = XmlParser.parseXmlFile(this.sourcetxlf_nonSeg);
    org.dom4j.Element root_source = document_source.getRootElement();

    org.dom4j.Document document_formatted = XmlParser.parseXmlFile(this.reformattedsourcetxlf_nonSeg);
    org.dom4j.Element root_formatted = document_formatted.getRootElement();

    List<org.dom4j.Element> text_source = collectVerifySources(root_source);
    List<org.dom4j.Element> text_formatted = collectVerifySources(root_formatted);
    int numberOfPara_source = text_source.size();

    Workbook wb = new Workbook();
    Worksheet ws = wb.getWorksheets().get(0);
    Cells cells = ws.getCells();
    int count = Math.max(text_source.size(), text_formatted.size());
    // Row index for column 1: advances only for extractable formatted paragraphs, so the
    // two columns are not necessarily aligned row by row.
    int t_count = 0;
    for (int i = 0; i < count; i++) {
        String t_src = i < text_source.size() ? renderVerifySourceText(text_source.get(i)) : "";
        cells.get(i, 0).setHtmlString("<html>" + t_src.trim().replace("<br> ", "&#8629;<br>") + "</html>");

        String t_fom = i < text_formatted.size() ? renderVerifyFormattedText(text_formatted.get(i)) : "";

        // Text left once struck-through spans and all markup/markers are dropped.
        String accepted_t_fom = t_fom.replaceAll("(?s)<strike>.*?</strike>", "").replace("<u>", "")
                .replace("</u>", "").replace("&amp;paradel;", "").replace("&amp;parains;", "")
                .replace("&amp;hf;", "").replace("<br>", "");
        if (extractionSupportImpl.isExtractable(accepted_t_fom)) {
            String input = "<html>" + t_fom.replace("&amp;hf;", "").replace("&amp;parains;", "")
                    .replace("<br> ", "&#8629;<br>").replace("&amp;paradel;", "<strike>&para;</strike><br>")
                    + "</html>";
            cells.get(t_count, 1).setHtmlString(input);
            t_count++;
        }
    }
    wb.save(this.prjfolder + File.separator + "verifyParas.xlsx");
    if (numberOfPara_source == t_count) {
        System.out.println("result: TRUE source: " + numberOfPara_source + " formatted: " + t_count);
        isValid = true;
    } else {
        System.out.println("result: false source: " + numberOfPara_source + " formatted: " + t_count);
    }
    return isValid;
}

/**
 * Collects the {@code source} child of every {@code trans-unit} found via the given element, in
 * document order. An entry is {@code null} when a trans-unit has no {@code source} child.
 */
private List<org.dom4j.Element> collectVerifySources(org.dom4j.Element root) {
    List<org.dom4j.Element> sources = new ArrayList<org.dom4j.Element>();
    for (Object transUnit : root.selectNodes("//*[name() = 'trans-unit']")) {
        sources.add(((org.dom4j.Element) transUnit).element("source"));
    }
    return sources;
}

/**
 * Renders an original {@code source} element as HTML text: text nodes are escaped, tab
 * placeholders become a space and line-break placeholders become {@code "<br> "}. All other
 * inline elements are ignored.
 */
private String renderVerifySourceText(org.dom4j.Element source) {
    StringBuilder html = new StringBuilder();
    for (Object node : source.content()) {
        if (node instanceof org.dom4j.Text) {
            html.append(escapeVerifyText(((org.dom4j.Text) node).getText()));
        } else if (node instanceof org.dom4j.Element) {
            org.dom4j.Element e = (org.dom4j.Element) node;
            if (isVerifyTag(e, "x", "x-tab")) {
                html.append(" ");
            } else if (isVerifyTag(e, "x", "lb")) {
                html.append("<br> ");
            }
        }
    }
    return html.toString();
}

/**
 * Renders a reformatted {@code source} element as HTML text, mapping strike-through
 * {@code bx}/{@code ex} pairs to {@code <strike>} and underlined {@code bpt}/{@code ept} pairs
 * (whose text contains {@code type="1"}) to {@code <u>}. Pairs are matched through their
 * {@code rid} attribute.
 */
private String renderVerifyFormattedText(org.dom4j.Element source) {
    StringBuilder html = new StringBuilder();
    // rid values of the strike/underline tags that are currently open.
    List<String> node_ids = new ArrayList<String>();
    for (Object node : source.content()) {
        if (node instanceof org.dom4j.Text) {
            html.append(escapeVerifyText(((org.dom4j.Text) node).getText()));
        } else if (node instanceof org.dom4j.Element) {
            org.dom4j.Element e = (org.dom4j.Element) node;
            // attributeValue yields null for a missing attribute instead of throwing.
            String rid = e.attributeValue("rid");
            if (isVerifyTag(e, "bx", "x-strike-through")) {
                html.append("<strike>");
                node_ids.add(rid);
            } else if ("ex".equals(e.getName())) {
                if (node_ids.contains(rid)) {
                    html.append("</strike>");
                    node_ids.remove(rid);
                }
            } else if (isVerifyTag(e, "bpt", "x-underlined") && e.getText().contains("type=\"1\"")) {
                html.append("<u>");
                node_ids.add(rid);
            } else if ("ept".equals(e.getName())) {
                if (node_ids.contains(rid)) {
                    html.append("</u>");
                    node_ids.remove(rid);
                }
            } else if (isVerifyTag(e, "x", "x-tab")) {
                html.append(" ");
            } else if (isVerifyTag(e, "x", "lb")) {
                html.append("<br> ");
            }
        }
    }

    String t_fom = html.toString();
    if (!t_fom.contains("<u>")) {
        // No paired underline was seen: fall back to an isolated opening underline tag.
        List<?> openUnderlines = source
                .selectNodes("..//*[name() = 'it'][@ctype = 'x-underlined'][@pos = 'open']");
        if (!openUnderlines.isEmpty()) {
            org.dom4j.Node node = (org.dom4j.Node) openUnderlines.get(0);
            if (node.getText().contains("type=\"1\"")) {
                t_fom = "<u>" + t_fom + "</u>";
            }
        }
    } else if ((!t_fom.contains("<strike>"))
            && (!source.selectNodes("..//*[name() = 'it'][@ctype = 'x-strike-through'][@pos = 'open']")
                    .isEmpty())) {
        // NOTE(review): this strike-through fallback only runs when the text already contains
        // "<u>"; confirm that coupling is intended rather than an independent check.
        t_fom = "<strike>" + t_fom + "</strike>";
    }
    return t_fom;
}

/** Escapes {@code &}, {@code <} and {@code >} so raw text can be embedded in the HTML cell value. */
private String escapeVerifyText(String text) {
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
}

/** Returns whether the element has the given name and {@code ctype}; a missing ctype never matches. */
private boolean isVerifyTag(org.dom4j.Element e, String name, String ctype) {
    return name.equals(e.getName()) && ctype.equals(e.attributeValue("ctype"));
}