List of usage examples for the org.apache.commons.configuration.BaseConfiguration constructor BaseConfiguration()
BaseConfiguration
From source file: revaligner.service.FileAligner.java
/**
 * Builds the alignment document ({@code rev_aligned.xml}) by pairing source paragraphs with
 * target paragraphs in document order.
 *
 * <p>Inputs are the three XML files referenced by {@code reformattedsourcetxlf_nonSeg},
 * {@code reformattedtargettxlf_nonSeg} and {@code reformattedtargettxlf_seg}. Outputs are:
 * <ul>
 *   <li>{@code alignedfile} - the {@code <alinger>} document with one {@code group} per
 *       extractable source trans-unit and one {@code unit} per non-empty paragraph split;</li>
 *   <li>{@code reformattedtargetmapfile} - tab-separated lines
 *       {@code "group - unit - seg \t targetParagraphIndex \t targetSegmentIndex"};</li>
 *   <li>{@code verifySegsPop.xlsx} in the project folder - source text in column 0, target
 *       text in column 1, one row per unit.</li>
 * </ul>
 *
 * <p>Fixes relative to the previous revision: the trailing loop that emits unmatched target
 * paragraphs now indexes by its own loop variable (it used to stall on the first
 * {@code &hf;} paragraph and drop everything after it); skipping {@code &hf;} target
 * paragraphs is bounds-checked; the output writers are closed even when writing fails.
 *
 * @return a one-element array whose single slot is never assigned (always {@code null});
 *         kept for interface compatibility
 * @throws Exception if parsing, segmentation or file output fails
 */
public String[] createAlignedXML() throws Exception {
    System.out.println("creating aligned xml....");
    String[] res = new String[1];
    this.txlftrgsegmap = new LinkedHashMap();
    this.alignedfile = (this.prjfolder + File.separator + "rev_aligned.xml");
    this.reformattedtargetmapfile = (this.prjfolder + File.separator + "target_reformatted" + File.separator
            + ".mp");
    StringBuffer sbmp = new StringBuffer();
    if (new File(this.alignedfile).exists()) {
        new File(this.alignedfile).delete();
    }

    // NOTE(review): this segmenter is never used below; its construction is kept only because
    // createConfigForSegmenter/getSegmenter may have side effects or validate the language.
    SegmenterFactory factory = new SegmenterFactory();
    Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage);
    Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig);

    // Skeleton of the output document: <alinger><head/><aligned/><orphans/></alinger>.
    org.dom4j.Document document = DocumentHelper.createDocument();
    org.dom4j.Element root = document.addElement("alinger");
    org.dom4j.Element head = root.addElement("head");
    head.addAttribute("src_lang", this.sourcelanguage);
    head.addAttribute("trg_lang", this.targetlanguage);
    head.addAttribute("creator", this.creatorid);
    org.dom4j.Element aligned = root.addElement("aligned");
    root.addElement("orphans");

    List list_source_formatted_nonSeg = XmlParser.parseXmlFile(this.reformattedsourcetxlf_nonSeg)
            .getRootElement().selectNodes("//*[name() = 'trans-unit']");
    List list_target_nonSeg = XmlParser.parseXmlFile(this.reformattedtargettxlf_nonSeg).getRootElement()
            .selectNodes("//*[name() = 'trans-unit']");
    List list_target_seg = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg).getRootElement()
            .selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']");

    // Index of the next target paragraph that has not been paired with a source paragraph yet.
    int trg_para_count = 0;

    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    Workbook wb = new Workbook();
    Cells cells = wb.getWorksheets().get(0).getCells();
    int cnt = 0; // current spreadsheet row
    int gcount = -1; // id of the most recently created <group>

    for (int i = 0; i < list_source_formatted_nonSeg.size(); i++) {
        org.dom4j.Element src_txlf = ((org.dom4j.Element) list_source_formatted_nonSeg.get(i))
                .element("source");
        String merged_text = getTxlfElementText_withFakeTC(src_txlf);
        if (!extractionSupportImpl
                .isExtractable(merged_text.replace("&paradel;", "").replace("&parains;", ""))) {
            continue;
        }

        gcount++;
        org.dom4j.Element group = aligned.addElement("group");
        group.addAttribute("id", Integer.toString(gcount));
        merged_text = trimText(merged_text, true)[0];
        group.addElement("text").setText(merged_text.replace("&paradel;", "").replace("&parains;", ""));

        // Paragraph-level split of the merged text on the &paradel; marker.
        String[] split_merged_text = merged_text.replaceAll("(&paradel;)+", "&paradel;")
                .replaceAll("^&paradel;", "").replaceAll("&paradel;$", "").split("&paradel;");

        // Sentence-level segmentation of the same text; segments that still contain &paradel;
        // are cut again below so that every segment belongs to exactly one paragraph bucket.
        List<String> segmentsGroup = segmentStringWithRevs(
                merged_text.replaceAll("(&paradel;)+", "&paradel;").replace("&parains;", ""),
                this.sourcelanguage);
        List<List<String>> resegmentedGroup = new ArrayList<List<String>>();
        resegmentedGroup.add(new ArrayList<String>());
        int idx = 0;
        for (int s = 0; s < segmentsGroup.size(); s++) {
            String orgs = segmentsGroup.get(s);
            if (!orgs.contains("&paradel;")) {
                resegmentedGroup.get(idx).add(fixMissingTags(orgs));
                continue;
            }
            String[] newsegs = orgs.split("&paradel;");
            for (int ss = 0; ss < newsegs.length; ss++) {
                String sss = newsegs[ss];
                if (!sss.trim().equals("")) {
                    resegmentedGroup.get(idx).add(fixMissingTags(sss));
                }
                // Open the next paragraph bucket unless this was the last piece or the
                // current bucket is still empty.
                if ((resegmentedGroup.get(idx).size() != 0) && (ss != newsegs.length - 1)) {
                    resegmentedGroup.add(new ArrayList<String>());
                    idx++;
                }
            }
            if (orgs.trim().endsWith("&paradel;")) {
                resegmentedGroup.add(new ArrayList<String>());
                idx++;
            }
        }

        // Diagnostics for the case where there are more paragraphs than segment buckets.
        if (split_merged_text.length > resegmentedGroup.size()) {
            System.out.println(i);
            System.out.println("merged_text: " + merged_text);
            for (String smt : split_merged_text) {
                System.out.println("split_merged_text: " + smt);
            }
            for (List<String> smts : resegmentedGroup) {
                System.out.println("resegmentedGroup: " + smts);
            }
            for (String smtss : segmentsGroup) {
                System.out.println("segmentedGroup: " + smtss);
            }
        }

        for (int j = 0; j < split_merged_text.length; j++) {
            // Paragraphs that contain nothing but empty ins/del wrappers produce no unit.
            if (split_merged_text[j].replaceAll("<(/)*ins>|<(/)*del>", "").trim().equals("")) {
                continue;
            }
            split_merged_text[j] = fixMissingTags(split_merged_text[j]);

            org.dom4j.Element unit = group.addElement("unit");
            unit.addAttribute("id", Integer.toString(j));
            unit.addAttribute("alignsegs", "false");
            org.dom4j.Element src = unit.addElement("src_para");
            org.dom4j.Element src_text = src.addElement("text");

            boolean ishf = split_merged_text[j].contains("&hf;");
            boolean isAddedPara = split_merged_text[j].contains("&parains;");
            src.addAttribute("added", "" + isAddedPara);

            String[] trim_result = trimText(split_merged_text[j].replace("&parains;", "").replace("&hf;", ""),
                    false);
            src.addAttribute("lefttrim", trim_result[1]);
            src.addAttribute("righttrim", trim_result[2]);
            split_merged_text[j] = trim_result[0];

            int src_tctype_para = TrackChangeHelper.getTrackChangeType(split_merged_text[j]);
            src.addAttribute("tctype", TrackChangeType.getName(src_tctype_para));

            // "Rejected" view: insertions removed, deletions kept as plain text.
            String rejected_src = split_merged_text[j].replaceAll("(?s)<ins>.*?</ins>", "")
                    .replace("<del>", "").replace("</del>", "");
            boolean unitLocked = (!extractionSupportImpl.isExtractable(rejected_src)) || ishf;
            unit.addAttribute("locked", unitLocked ? "true" : "false");

            src_text.setText(split_merged_text[j]);
            cells.get(cnt, 0).setHtmlString("<html>" + split_merged_text[j].replace("ins>", "u>")
                    .replace("del>", "strike>").replace("<br> ", "↵<br>") + "</html>");

            org.dom4j.Element src_segs = src.addElement("segments");
            List<String> segments = resegmentedGroup.get(j);
            for (int z = 0; z < segments.size(); z++) {
                String segment_text = trimText(segments.get(z), false)[0];
                org.dom4j.Element src_seg = src_segs.addElement("src_seg");
                src_seg.addAttribute("id", Integer.toString(z));
                src_seg.addAttribute("needreview", "false");
                src_seg.addAttribute("ignored", "false");
                int tctype_seg = TrackChangeHelper.getTrackChangeType(segment_text);
                src_seg.addAttribute("tctype", TrackChangeType.getName(tctype_seg));
                // "Accepted" view: deletions removed, insertions kept as plain text.
                String accepted_t = segment_text.replaceAll("(?s)<del>.*?</del>", "").replace("<ins>", "")
                        .replace("</ins>", "");
                src_seg.addAttribute("isExtractable",
                        Boolean.toString(extractionSupportImpl.isExtractable(accepted_t)));
                String rejected_s = segment_text.replaceAll("(?s)<ins>.*?</ins>", "").replace("<del>", "")
                        .replace("</del>", "");
                boolean segLocked = (!extractionSupportImpl.isExtractable(rejected_s)) || ishf;
                src_seg.addAttribute("locked", segLocked ? "true" : "false");
                src_seg.setText(segment_text);
            }

            org.dom4j.Element trg = unit.addElement("trg_para");
            trg.addAttribute("id", Integer.toString(gcount) + " - " + Integer.toString(j));
            org.dom4j.Element trg_text = trg.addElement("text");
            org.dom4j.Element trg_segs = trg.addElement("segments");

            // A source paragraph consumes a target paragraph unless its track-change type is 1
            // (presumably "inserted" - TODO confirm against TrackChangeType), it is a
            // &parains; paragraph, or it is a header/footer (&hf;) paragraph.
            boolean wantsTarget = (src_tctype_para != 1) && (!isAddedPara) && (!ishf);
            if (wantsTarget) {
                // Skip header/footer target paragraphs; stop at the end of the list instead of
                // indexing past it.
                while ((trg_para_count < list_target_nonSeg.size())
                        && ((org.dom4j.Element) list_target_nonSeg.get(trg_para_count)).element("source")
                                .getText().contains("&hf;")) {
                    trg_para_count++;
                }
            }

            if (wantsTarget && (trg_para_count < list_target_nonSeg.size())) {
                org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(trg_para_count))
                        .element("source");
                org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) list_target_seg.get(trg_para_count);

                String trg_formatted_text = getTxlfElementText_normal(trg_txlf);
                trg_text.setText(trg_formatted_text.replace("&hf;", ""));
                cells.get(cnt, 1)
                        .setHtmlString("<html>" + trg_formatted_text.replace("ins>", "u>")
                                .replace("del>", "strike>").replace("&hf;", "").replace("<br> ", "↵<br>")
                                + "</html>");
                cnt++;

                List<String> trgsegs = segmentStringWithRevs(trg_formatted_text, this.targetlanguage);
                List srcsegs = src_segs.elements("src_seg");
                int trg_tmp_cnt = 0; // next target segment to place
                // z walks the output slots; a slot facing a source segment of track-change
                // type 1 stays empty and does not consume a target segment.
                for (int z = 0; trg_tmp_cnt < trgsegs.size(); z++) {
                    org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
                    trg_seg.addAttribute("id", Integer.toString(z));
                    trg_seg.addAttribute("edited", "false");
                    boolean facesType1Source = (z < srcsegs.size()) && ((org.dom4j.Element) srcsegs.get(z))
                            .attributeValue("tctype").equals(TrackChangeType.getName(1));
                    if (facesType1Source) {
                        trg_seg.addAttribute("isExtractable", "false");
                        trg_seg.setText("");
                    } else {
                        String trgsegtext = trgsegs.get(trg_tmp_cnt).replace("&hf;", "").trim();
                        trg_seg.addAttribute("isExtractable",
                                Boolean.toString(extractionSupportImpl.isExtractable(trgsegtext)));
                        trg_seg.setText(trgsegtext);
                        String mapid = Integer.toString(gcount) + " - " + Integer.toString(j) + " - "
                                + Integer.toString(z);
                        // Fails fast (IndexOutOfBoundsException) when the segmented target file
                        // has no trans-unit at this position, as the original lookup did.
                        ((org.dom4j.Element) trg_txlf_seg.elements("trans-unit").get(trg_tmp_cnt)).content();
                        sbmp.append(mapid + "\t" + trg_para_count + "\t" + trg_tmp_cnt + "\n");
                        trg_tmp_cnt++;
                    }
                }
                trg_para_count++;
            } else {
                trg_text.setText("");
                cnt++;
            }

            // Pad the target side with empty segments so it has at least as many as the source.
            int trgcnt = trg_segs.elements("trg_seg").size();
            int srccnt = src_segs.elements("src_seg").size();
            for (int padId = trgcnt; padId < srccnt; padId++) {
                org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
                trg_seg.addAttribute("id", Integer.toString(padId));
                trg_seg.addAttribute("edited", "false");
                trg_seg.addAttribute("isExtractable", "false");
                trg_seg.setText("");
            }
        }
    }

    // Remaining target paragraphs have no source counterpart: emit each non header/footer one
    // as a target-only group. The paragraph index is the loop variable itself so that an
    // &hf; paragraph is skipped rather than stalling the loop.
    int unitcnt = list_source_formatted_nonSeg.size();
    for (int i = trg_para_count; i < list_target_nonSeg.size(); i++) {
        org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(i)).element("source");
        org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) list_target_seg.get(i);
        if (trg_txlf.getText().contains("&hf;")) {
            continue;
        }
        org.dom4j.Element group = aligned.addElement("group");
        group.addAttribute("id", Integer.toString(unitcnt));
        group.addElement("text").setText("");
        org.dom4j.Element unit = group.addElement("unit");
        unit.addAttribute("id", "0");
        unit.addAttribute("alignsegs", "false");
        unit.addAttribute("locked", "false");
        org.dom4j.Element trg = unit.addElement("trg_para");
        trg.addAttribute("id", Integer.toString(unitcnt) + " - 0");
        org.dom4j.Element trg_text = trg.addElement("text");
        String trg_formatted_text = getTxlfElementText_normal(trg_txlf);
        trg_text.setText(trg_formatted_text.replace("&hf;", ""));
        org.dom4j.Element trg_segs = trg.addElement("segments");
        List<String> trgsegs = segmentStringWithRevs(trg_formatted_text.replace("&hf;", ""),
                this.targetlanguage);
        for (int z = 0; z < trgsegs.size(); z++) {
            org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg");
            trg_seg.addAttribute("id", Integer.toString(z));
            trg_seg.addAttribute("edited", "false");
            trg_seg.addAttribute("isExtractable",
                    Boolean.toString(extractionSupportImpl.isExtractable(trgsegs.get(z))));
            trg_seg.setText(trgsegs.get(z).trim());
            String mapid = Integer.toString(unitcnt) + " - 0 - " + Integer.toString(z);
            // Same fail-fast existence check as in the aligned branch above.
            ((org.dom4j.Element) trg_txlf_seg.elements("trans-unit").get(z)).content();
            sbmp.append(mapid + "\t" + i + "\t" + z + "\n");
        }
        unitcnt++;
    }

    wb.save(this.prjfolder + File.separator + "verifySegsPop.xlsx");

    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        writer.close();
    }

    OutputStreamWriter writermp = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.reformattedtargetmapfile)), "UTF8");
    try {
        writermp.write(sbmp.toString());
    } finally {
        writermp.close();
    }
    return res;
}
From source file: revaligner.service.FileAligner.java
/**
 * Builds the alignment document ({@code rev_aligned.xml}) using the external AutoAligner
 * command-line tool to match source segments with target segments.
 *
 * <p>Steps performed:
 * <ol>
 *   <li>Create the source side of the alignment document (groups, units, source segments) from
 *       {@code reformattedsourcetxlf_nonSeg}; every unlocked source segment is also written to a
 *       source {@code .txml} file under {@code <prjfolder>/nbaligner/<sourcelanguage>}.</li>
 *   <li>Write every extractable target segment from {@code reformattedtargettxlf_nonSeg} to a
 *       target {@code .txml} file under {@code <prjfolder>/nbaligner/<targetlanguage>}.</li>
 *   <li>Run the aligner executable, echo its output, and report the estimated completion time
 *       once via {@code setAlignProgress}.</li>
 *   <li>Unzip the result, read the aligned {@code .txml}, attach the matched target text to each
 *       source segment, and collect targets without a source under {@code orphans}.</li>
 * </ol>
 *
 * <p>Fixes relative to the previous revision: source segments are registered in the id map
 * under the group id ({@code gcount}) and looked up again via the {@code id} attributes of the
 * group and unit. Previously they were registered under the source trans-unit index and looked
 * up by list position, which stopped matching as soon as one non-extractable trans-unit or
 * empty paragraph split had been skipped. Writers and the process reader are now closed even
 * when an exception is thrown.
 *
 * @param prjid project id passed through to {@code setAlignProgress}
 * @throws Exception if any I/O, parsing or process step fails, or if the aligner produced no
 *         {@code .txml} output
 */
public void createAlignedXML_auto(String prjid) throws Exception {
    System.out.println("creating aligned xml with nbAligner....");
    this.alignedfile = (this.prjfolder + File.separator + "rev_aligned.xml");
    this.reformattedtargetmapfile = (this.prjfolder + File.separator + "target_reformatted" + File.separator
            + ".mp");
    // Maps "group - unit - segment" to the segmentId written to the source txml.
    HashMap<String, String> srcidmap = new HashMap<String, String>();

    // Fresh working directory for the aligner, with one sub-folder per language.
    this.nbalignerfolder = (this.prjfolder + File.separator + "nbaligner");
    if (!new File(this.nbalignerfolder).exists()) {
        new File(this.nbalignerfolder).mkdir();
    }
    FileUtils.cleanDirectory(new File(this.nbalignerfolder));

    String nbsourcefolder = this.nbalignerfolder + File.separator + this.sourcelanguage;
    new File(nbsourcefolder).mkdir();
    org.dom4j.Document nbsource = DocumentHelper.createDocument();
    org.dom4j.Element root_src = nbsource.addElement("txml");
    root_src.addAttribute("locale", this.sourcelanguage);
    root_src.addAttribute("version", "1.0");
    root_src.addAttribute("segtype", "sentence");
    org.dom4j.Element translatable_src = root_src.addElement("translatable");
    translatable_src.addAttribute("blockId", "1");

    String nbtargetfolder = this.nbalignerfolder + File.separator + this.targetlanguage;
    new File(nbtargetfolder).mkdir();
    org.dom4j.Document nbtarget = DocumentHelper.createDocument();
    org.dom4j.Element root_trg = nbtarget.addElement("txml");
    root_trg.addAttribute("locale", this.targetlanguage);
    root_trg.addAttribute("version", "1.0");
    root_trg.addAttribute("segtype", "sentence");
    org.dom4j.Element translatable_trg = root_trg.addElement("translatable");
    translatable_trg.addAttribute("blockId", "0");

    if (new File(this.alignedfile).exists()) {
        new File(this.alignedfile).delete();
    }

    // NOTE(review): this segmenter is never used below; its construction is kept only because
    // createConfigForSegmenter/getSegmenter may have side effects or validate the language.
    SegmenterFactory factory = new SegmenterFactory();
    Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage);
    Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig);

    // Skeleton of the output document: <alinger><head/><aligned/><orphans/></alinger>.
    org.dom4j.Document document = DocumentHelper.createDocument();
    org.dom4j.Element root = document.addElement("alinger");
    org.dom4j.Element head = root.addElement("head");
    head.addAttribute("src_lang", this.sourcelanguage);
    head.addAttribute("trg_lang", this.targetlanguage);
    head.addAttribute("creator", this.creatorid);
    org.dom4j.Element aligned = root.addElement("aligned");
    org.dom4j.Element orphans = root.addElement("orphans");

    List list_source_formatted_nonSeg = XmlParser.parseXmlFile(this.reformattedsourcetxlf_nonSeg)
            .getRootElement().selectNodes("//*[name() = 'trans-unit']");
    List list_target_nonSeg = XmlParser.parseXmlFile(this.reformattedtargettxlf_nonSeg).getRootElement()
            .selectNodes("//*[name() = 'trans-unit']");
    // The segmented target file is still parsed (as before) so a missing or malformed file
    // fails here; its content is not used by this method.
    XmlParser.parseXmlFile(this.reformattedtargettxlf_seg).getRootElement()
            .selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']");

    ExtractionSupportImpl extractionSupportImpl_src = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.sourcelanguage));
    Configuration config_src = new BaseConfiguration();
    config_src.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl_src.setConfiguration(config_src);
    ExtractionSupportImpl extractionSupportImpl_trg = new ExtractionSupportImpl(
            Locale.makeLocale(this.targetlanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config_trg = new BaseConfiguration();
    config_trg.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl_trg.setConfiguration(config_trg);

    int gcount = -1; // id of the most recently created <group>
    int segmentId = 0; // running id of segments written to the source txml

    // ---- 1. source side of the alignment document + source txml ----
    for (int i = 0; i < list_source_formatted_nonSeg.size(); i++) {
        org.dom4j.Element src_txlf = ((org.dom4j.Element) list_source_formatted_nonSeg.get(i))
                .element("source");
        String merged_text = getTxlfElementText_withFakeTC(src_txlf);
        if (!extractionSupportImpl_src
                .isExtractable(merged_text.replace("&paradel;", "").replace("&parains;", ""))) {
            continue;
        }

        gcount++;
        org.dom4j.Element group = aligned.addElement("group");
        group.addAttribute("id", Integer.toString(gcount));
        merged_text = trimText(merged_text, true)[0];
        group.addElement("text").setText(merged_text.replace("&paradel;", "").replace("&parains;", ""));

        // Paragraph-level split of the merged text on the &paradel; marker.
        String[] split_merged_text = merged_text.replaceAll("(&paradel;)+", "&paradel;")
                .replaceAll("^&paradel;", "").replaceAll("&paradel;$", "").split("&paradel;");

        // Sentence-level segmentation; segments still containing &paradel; are cut again so
        // that every segment belongs to exactly one paragraph bucket.
        List<String> segmentsGroup = segmentStringWithRevs(
                merged_text.replaceAll("(&paradel;)+", "&paradel;").replace("&parains;", ""),
                this.sourcelanguage);
        List<List<String>> resegmentedGroup = new ArrayList<List<String>>();
        resegmentedGroup.add(new ArrayList<String>());
        int idx = 0;
        for (int s = 0; s < segmentsGroup.size(); s++) {
            String orgs = segmentsGroup.get(s);
            if (!orgs.contains("&paradel;")) {
                resegmentedGroup.get(idx).add(fixMissingTags(orgs));
                continue;
            }
            String[] newsegs = orgs.split("&paradel;");
            for (int ss = 0; ss < newsegs.length; ss++) {
                String sss = newsegs[ss];
                if (!sss.trim().equals("")) {
                    resegmentedGroup.get(idx).add(fixMissingTags(sss));
                }
                if ((resegmentedGroup.get(idx).size() != 0) && (ss != newsegs.length - 1)) {
                    resegmentedGroup.add(new ArrayList<String>());
                    idx++;
                }
            }
            if (orgs.trim().endsWith("&paradel;")) {
                resegmentedGroup.add(new ArrayList<String>());
                idx++;
            }
        }

        // Diagnostics for the case where there are more paragraphs than segment buckets.
        if (split_merged_text.length > resegmentedGroup.size()) {
            System.out.println(i);
            System.out.println("merged_text: " + merged_text);
            for (String smt : split_merged_text) {
                System.out.println("split_merged_text: " + smt);
            }
            for (List<String> smts : resegmentedGroup) {
                System.out.println("resegmentedGroup: " + smts);
            }
            for (String smtss : segmentsGroup) {
                System.out.println("segmentedGroup: " + smtss);
            }
        }

        for (int j = 0; j < split_merged_text.length; j++) {
            // Paragraphs that contain nothing but empty ins/del wrappers produce no unit.
            if (split_merged_text[j].replaceAll("<(/)*ins>|<(/)*del>", "").trim().equals("")) {
                continue;
            }
            split_merged_text[j] = fixMissingTags(split_merged_text[j]);

            org.dom4j.Element unit = group.addElement("unit");
            unit.addAttribute("id", Integer.toString(j));
            unit.addAttribute("alignsegs", "false");
            org.dom4j.Element src = unit.addElement("src_para");
            org.dom4j.Element src_text = src.addElement("text");

            boolean isAddedPara = split_merged_text[j].contains("&parains;");
            src.addAttribute("added", "" + isAddedPara);
            String[] trim_result = trimText(split_merged_text[j].replace("&parains;", ""), false);
            src.addAttribute("lefttrim", trim_result[1]);
            src.addAttribute("righttrim", trim_result[2]);
            split_merged_text[j] = trim_result[0];

            int src_tctype_para = TrackChangeHelper.getTrackChangeType(split_merged_text[j]);
            src.addAttribute("tctype", TrackChangeType.getName(src_tctype_para));

            // "Rejected" view: insertions removed, deletions kept as plain text.
            String rejected_src = split_merged_text[j].replaceAll("(?s)<ins>.*?</ins>", "")
                    .replace("<del>", "").replace("</del>", "");
            unit.addAttribute("locked",
                    extractionSupportImpl_src.isExtractable(rejected_src) ? "false" : "true");
            src_text.setText(split_merged_text[j]);

            org.dom4j.Element src_segs = src.addElement("segments");
            List<String> segments = resegmentedGroup.get(j);
            for (int z = 0; z < segments.size(); z++) {
                String segment_text = trimText(segments.get(z), false)[0];
                org.dom4j.Element src_seg = src_segs.addElement("src_seg");
                src_seg.addAttribute("id", Integer.toString(z));
                src_seg.addAttribute("needreview", "false");
                src_seg.addAttribute("ignored", "false");
                int tctype_seg = TrackChangeHelper.getTrackChangeType(segment_text);
                src_seg.addAttribute("tctype", TrackChangeType.getName(tctype_seg));
                // "Accepted" view: deletions removed, insertions kept as plain text.
                String accepted_t = segment_text.replaceAll("(?s)<del>.*?</del>", "").replace("<ins>", "")
                        .replace("</ins>", "");
                src_seg.addAttribute("isExtractable",
                        Boolean.toString(extractionSupportImpl_src.isExtractable(accepted_t)));
                String rejected_s = segment_text.replaceAll("(?s)<ins>.*?</ins>", "").replace("<del>", "")
                        .replace("</del>", "");
                if (!extractionSupportImpl_src.isExtractable(rejected_s)) {
                    src_seg.addAttribute("locked", "true");
                } else {
                    src_seg.addAttribute("locked", "false");
                    // Only unlocked segments are handed to the aligner. The key uses the
                    // group id and unit id so the lookup in step 5 resolves the same segment.
                    org.dom4j.Element segment_src = translatable_src.addElement("segment");
                    segment_src.addAttribute("segmentId", Integer.toString(segmentId));
                    srcidmap.put(gcount + " - " + j + " - " + z, Integer.toString(segmentId));
                    segmentId++;
                    segment_src.addElement("source").setText(rejected_s);
                }
                src_seg.setText(segment_text);
            }
        }
    }

    // ---- 2. target txml: every extractable target segment, whitespace-normalised ----
    segmentId = 0;
    for (int i = 0; i < list_target_nonSeg.size(); i++) {
        org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(i)).element("source");
        String trg_formatted_text = getTxlfElementText_normal(trg_txlf);
        List<String> trgsegs = segmentStringWithRevs(trg_formatted_text, this.targetlanguage);
        for (int j = 0; j < trgsegs.size(); j++) {
            String trgseg = trgsegs.get(j).trim().replaceAll("(\\s)+", " ");
            if (extractionSupportImpl_trg.isExtractable(trgseg)) {
                org.dom4j.Element segment_trg = translatable_trg.addElement("segment");
                segment_trg.addAttribute("segmentId", Integer.toString(segmentId));
                segmentId++;
                segment_trg.addElement("source").setText(trgseg);
            }
        }
    }

    OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(
            new FileOutputStream(nbsourcefolder + File.separator + this.sourcelanguage + ".txml")), "UTF8");
    try {
        nbsource.write(writer);
    } finally {
        writer.close();
    }
    writer = new OutputStreamWriter(new BufferedOutputStream(
            new FileOutputStream(nbtargetfolder + File.separator + this.targetlanguage + ".txml")), "UTF8");
    try {
        nbtarget.write(writer);
    } finally {
        writer.close();
    }

    // ---- 3. run the external aligner and watch its console output ----
    String pahtexe = "\\\\10.2.50.190\\AutoAlignerCLI\\AutoAlignerCLI.exe";
    ProcessBuilder pb = new ProcessBuilder(new String[] { pahtexe, "-i", this.nbalignerfolder, "-o",
            this.nbalignerfolder, "-lang_pairs", this.sourcelanguage + "_" + this.targetlanguage,
            "-lang_detect", "normal", "-identicals", "-match_filenames", "-txml_or_xmx_output",
            "-docnames_output", "-disallow_src_merging" });
    pb.redirectErrorStream(true);
    Process p = pb.start();
    BufferedReader br = new BufferedReader(new InputStreamReader(p.getInputStream()));
    try {
        boolean sentESTTime = false;
        boolean alignstart = false;
        String lineRead;
        while ((lineRead = br.readLine()) != null) {
            System.out.println(lineRead);
            if (lineRead.contains("Aligning...")) {
                alignstart = true;
                continue;
            }
            // Only estimates printed after the "Aligning..." marker are recorded.
            if ((lineRead.contains("Estimated Time to Completion:")) && (alignstart)) {
                this.estimateNBAlignerCompTime = lineRead.replace("Estimated Time to Completion: ", "")
                        .replace(" Minute(s)", "");
            }
            // The estimate is forwarded at most once per run.
            if ((!this.estimateNBAlignerCompTime.equals("")) && (!sentESTTime)) {
                sentESTTime = true;
                try {
                    int minutes = 200 + Integer.parseInt(this.estimateNBAlignerCompTime);
                    setAlignProgress(prjid, minutes);
                    this.estimateNBAlignerCompTime = "";
                } catch (Exception ex) {
                    // Progress reporting is best effort; alignment continues regardless.
                    ex.printStackTrace();
                }
            }
        }
    } finally {
        br.close();
    }
    p.waitFor();

    // ---- 4. locate and read the aligner result ----
    for (File file : new File(this.nbalignerfolder).listFiles()) {
        if (file.getName().endsWith(".zip")) {
            UnzipFile.UnZipIt(file.getAbsolutePath(), this.nbalignerfolder);
        }
    }
    String alignedtxml = "";
    for (File file : new File(this.nbalignerfolder).listFiles()) {
        if (file.getName().endsWith(".txml")) {
            alignedtxml = file.getAbsolutePath();
        }
    }
    if (alignedtxml.equals("")) {
        throw new Exception("file didn't aligned by nbaligner");
    }

    // source segment index -> { target text, score, target id or "first - last" id range }
    HashMap<String, String[]> alignedtrgs = new HashMap<String, String[]>();
    // targets that came back without a source: { target text, target id }
    List<String[]> missingtrgs = new ArrayList<String[]>();
    int src_idx = -1;
    org.dom4j.Element root_alignedtxmldoc = XmlParser.parseXmlFile(alignedtxml).getRootElement();
    List translatables = root_alignedtxmldoc.elements("translatable");
    for (int i = 0; i < translatables.size(); i++) {
        List alignedSegments = ((org.dom4j.Element) translatables.get(i)).elements("segment");
        for (int j = 0; j < alignedSegments.size(); j++) {
            org.dom4j.Element segment = (org.dom4j.Element) alignedSegments.get(j);
            org.dom4j.Element source = segment.element("source");
            org.dom4j.Element target = segment.element("target");
            boolean hasSource = (source != null) && (!source.getTextTrim().equals(""));
            boolean hasTarget = (target != null) && (!target.getTextTrim().equals(""));
            if (hasSource) {
                // Counted even without a target so the index stays in step with the
                // segmentIds written to the source txml.
                src_idx++;
            }
            if (!hasTarget) {
                continue;
            }
            int trg_idx = Integer.parseInt(target.attributeValue("sent_no"));
            if (!hasSource) {
                missingtrgs.add(new String[] { target.getTextTrim(), Integer.toString(trg_idx) });
                continue;
            }
            String matchscore = target.attributeValue("score");
            if (matchscore.equals("0")) {
                // A score of "0" is stored as "1".
                alignedtrgs.put(Integer.toString(src_idx),
                        new String[] { target.getTextTrim(), "1", Integer.toString(trg_idx) });
            } else if (target.attribute("original_segments_count") != null) {
                int merged_cnt = Integer.parseInt(target.attributeValue("original_segments_count"));
                String trg_idx_str = Integer.toString(trg_idx) + " - "
                        + Integer.toString(trg_idx + merged_cnt - 1);
                alignedtrgs.put(Integer.toString(src_idx),
                        new String[] { target.getTextTrim(), matchscore, trg_idx_str });
            } else {
                alignedtrgs.put(Integer.toString(src_idx),
                        new String[] { target.getTextTrim(), matchscore, Integer.toString(trg_idx) });
            }
        }
    }

    // ---- 5. attach aligned targets to the source segments ----
    int null_idx = 0; // running id for target slots that received no aligned text
    List groups = aligned.elements("group");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = (org.dom4j.Element) groups.get(i);
        List units = group.elements("unit");
        for (int j = 0; j < units.size(); j++) {
            org.dom4j.Element unit = (org.dom4j.Element) units.get(j);
            org.dom4j.Element src_para_segs = unit.element("src_para").element("segments");
            org.dom4j.Element trg_para_segs = unit.addElement("trg_para").addElement("segments");
            List src_segs = src_para_segs.elements("src_seg");
            for (int z = 0; z < src_segs.size(); z++) {
                org.dom4j.Element src_seg = (org.dom4j.Element) src_segs.get(z);
                org.dom4j.Element trg_seg = trg_para_segs.addElement("trg_seg");
                // Same key shape as used when the segment was registered in step 1.
                String mapid = group.attributeValue("id") + " - " + unit.attributeValue("id") + " - "
                        + Integer.toString(z);
                trg_seg.addAttribute("edited", "false");

                String sourceidintxml = srcidmap.get(mapid);
                String[] match = (sourceidintxml == null) ? null : alignedtrgs.get(sourceidintxml);
                String trgsegtext = "";
                if (match != null) {
                    src_seg.addAttribute("locked", "true");
                    trgsegtext = match[0];
                    if (Integer.parseInt(match[1]) < needreviewthreshhold) {
                        src_seg.addAttribute("needreview", "true");
                    }
                    trg_seg.addAttribute("id", match[2]);
                    trg_seg.addAttribute("isExtractable",
                            Boolean.toString(extractionSupportImpl_trg.isExtractable(trgsegtext)));
                } else {
                    trg_seg.addAttribute("id", "n - " + null_idx);
                    null_idx++;
                    trg_seg.addAttribute("isExtractable", "false");
                }
                trg_seg.setText(trgsegtext);
            }
        }
    }

    // ---- 6. targets without any source go into a single orphan unit ----
    org.dom4j.Element orp_unit = orphans.addElement("unit");
    orp_unit.addAttribute("id", "0");
    org.dom4j.Element orp_segments = orp_unit.addElement("trg_para").addElement("segments");
    for (int i = 0; i < missingtrgs.size(); i++) {
        String orptrgtext = missingtrgs.get(i)[0];
        String orptrgid = missingtrgs.get(i)[1];
        org.dom4j.Element orp_trg_seg = orp_segments.addElement("trg_seg");
        orp_trg_seg.addAttribute("id", orptrgid);
        orp_trg_seg.addAttribute("edited", "false");
        orp_trg_seg.addAttribute("isExtractable",
                Boolean.toString(extractionSupportImpl_trg.isExtractable(orptrgtext)));
        orp_trg_seg.setText(orptrgtext);
    }

    OutputStreamWriter oswriter = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(oswriter);
    } finally {
        oswriter.close();
    }
}
From source file: revaligner.service.FileAligner.java
public void buildTargetContentMap() throws Exception { System.out.println("rebuilding target content map file...."); ExtractionSupportImpl extractionSupportImpl_trg = new ExtractionSupportImpl( Locale.makeLocale(this.targetlanguage), Locale.makeLocale(this.targetlanguage)); Configuration config_trg = new BaseConfiguration(); config_trg.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl_trg.setConfiguration(config_trg); this.txlftrgsegmap = new LinkedHashMap(); this.txlftrgsewsmap = new LinkedHashMap(); org.dom4j.Document document_target_seg = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg); org.dom4j.Element root_target_seg = document_target_seg.getRootElement(); List list_target_para = root_target_seg.selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']"); int segmentId = 1; for (int i = 0; i < list_target_para.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) list_target_para.get(i); for (int j = 0; j < group.elements("trans-unit").size(); j++) { org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) group.elements("trans-unit").get(j); String trgseg = trg_txlf_seg.element("source").getText().trim().replaceAll("(\\s)+", " "); if (extractionSupportImpl_trg.isExtractable(trgseg)) { List tmp_content = new ArrayList(); for (int z = 0; z < trg_txlf_seg.content().size(); z++) { if ((trg_txlf_seg.content().get(z) instanceof org.dom4j.Element)) { tmp_content.add(trg_txlf_seg.content().get(z)); }//w w w .j a v a 2 s. c om } this.txlftrgsegmap.put(Integer.valueOf(segmentId), tmp_content); boolean[] seg_attr = { false, false }; if (j == 0) { seg_attr[0] = true; } if (j == group.elements("trans-unit").size() - 1) { seg_attr[1] = true; } this.txlftrgsewsmap.put(Integer.valueOf(segmentId), seg_attr); segmentId++; } } } }
From source file: revaligner.service.FileAligner.java
public boolean verifysegments() throws Exception { System.out.println("verifying segments mapping...."); boolean isValid = false; ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); org.dom4j.Document document_source = XmlParser.parseXmlFile(this.sourcetxlf_seg); org.dom4j.Element root_source = document_source.getRootElement(); List list_source = root_source.selectNodes("//*[name() = 'trans-unit']"); int numberOfPara_source = list_source.size(); List<org.dom4j.Element> text_source = new ArrayList(); Iterator iter_source = list_source.iterator(); while (iter_source.hasNext()) { org.dom4j.Element source = ((org.dom4j.Element) iter_source.next()).element("source"); text_source.add(source);// ww w . j av a2 s.c om } List<String> mergedsegtext = new ArrayList(); org.dom4j.Document alignedxml = XmlParser.parseXmlFile(this.alignedfile); org.dom4j.Element root = alignedxml.getRootElement(); List groups = root.selectNodes("//*[name() = 'group']"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List units = group.elements("unit"); ArrayList<String> keys = new ArrayList(); ArrayList<String> key_left = new ArrayList(); ArrayList<String> key_right = new ArrayList(); ArrayList<String> org_keys = new ArrayList(); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); org.dom4j.Element src_para = unit.element("src_para"); if (src_para != null) { List segs = src_para.element("segments").elements("src_seg"); for (int z = 0; z < segs.size(); z++) { org.dom4j.Element seg = (org.dom4j.Element) segs.get(z); keys.add(seg.getText().replaceAll("(?s)<del>.*?</del>", "").replaceAll("<(/)*ins>", "") .replace("<br>", "").replace("<", 
"<").replace(">", ">").replace("&", "&") .trim()); org_keys.add(seg.getText()); if ((z == 0) && (z == segs.size() - 1)) { key_left.add(src_para.attributeValue("lefttrim")); key_right.add(src_para.attributeValue("righttrim")); } else if (z == 0) { key_left.add(src_para.attributeValue("lefttrim")); key_right.add("true"); } else if (z == segs.size() - 1) { key_left.add("true"); key_right.add(src_para.attributeValue("righttrim")); } else { key_left.add("true"); key_right.add("true"); } } } } SegmenterFactory factory = new SegmenterFactory(); Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage); Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig); List<String> finsegs = segmenter.segment( group.elementText("text").replaceAll("(?s)<del>.*?</del>", "").replaceAll("<(/)*ins>", "") .replace("<br>", "").replace("<", "<").replace(">", ">").replace("&", "&")); ArrayList<ArrayList<Integer>> indices = new ArrayList(); int key_start_index = 0; ArrayList<Integer> indice; for (int k = 0; k < finsegs.size(); k++) { String finsegtext = (String) finsegs.get(k); String combined_key = ""; indice = new ArrayList(); for (int x = key_start_index; x < keys.size(); x++) { combined_key = combined_key + (String) keys.get(x); indice.add(Integer.valueOf(x)); if (combined_key.replace("", " ").trim().replaceAll("(\\s)+", "") .equals(finsegtext.replace("", " ").trim().replaceAll("(\\s)+", ""))) { indices.add(indice); key_start_index = x + 1; break; } } } for (int m = 0; m < indices.size(); m++) { ArrayList<Integer> temp_indice = (ArrayList) indices.get(m); String temp = ""; for (int it : temp_indice) { temp = temp + (String) org_keys.get(it); } mergedsegtext.add(temp); } } List<String> rejectedtexts = new ArrayList(); Workbook wb = new Workbook(); Worksheet ws = wb.getWorksheets().get(0); Cells cells = ws.getCells(); int count = Math.max(text_source.size(), mergedsegtext.size()); int t_count = 0; for (int i = 0; i < 
count; i++) { String t_src = ""; String t_fom = ""; if (i < text_source.size()) { org.dom4j.Element src = (org.dom4j.Element) text_source.get(i); for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { t_src = t_src + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { t_src = t_src + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb"))) { t_src = t_src + "<br> "; } } } } cells.get(i, 0).setHtmlString(t_src); if (i < mergedsegtext.size()) { t_fom = ((String) mergedsegtext.get(i)).replace("<ins>", "<u>").replace("</ins>", "</u>") .replace("<del>", "<strike>").replace("</del>", "</strike>"); } String accepted_t_fom = t_fom.replaceAll("(?s)<strike>.*?</strike>", "").replace("<u>", "") .replace("</u>", "").replace("&paradel;", "").replace("<br>", ""); String rejected_t_fom = t_fom.replaceAll("(?s)<u>.*?</u>", "").replace("<strike>", "") .replace("</strike>", "").replace("&paradel;", "").replace("<br>", ""); if (extractionSupportImpl.isExtractable(accepted_t_fom)) { String input = "<html>" + t_fom.replace("<br> ", "↵<br>").replace("&paradel;", "<strike>¶</strike><br>") + "</html>"; cells.get(t_count, 1).setHtmlString(input); t_count++; if ((i < mergedsegtext.size()) && (TrackChangeHelper.getTrackChangeType((String) mergedsegtext.get(i)) == 3)) { rejectedtexts.add(rejected_t_fom); } else { rejectedtexts.add(""); } } } wb.save(this.prjfolder + File.separator + "verifySegs.xlsx"); if (numberOfPara_source == t_count) { System.out.println("result: TRUE source: " + numberOfPara_source + " formatted: " + t_count); isValid = true; String timestamp = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'").format(new Date()); for (int r = 
0; r < list_source.size(); r++) { if (!((String) rejectedtexts.get(r)).equals("")) { org.dom4j.Element transunit = (org.dom4j.Element) list_source.get(r); org.dom4j.Element originalbase = transunit.addElement("alt-trans"); org.dom4j.Element source = transunit.element("source"); org.dom4j.Element target = transunit.element("target"); source.addAttribute("gs4tr:seginfo", "<root username=\"TC Aligner\" timestamp=\"" + timestamp + "\"/>"); if (target != null) { transunit.elements().add(transunit.elements().indexOf(target) + 1, originalbase.clone()); } else { transunit.elements().add(transunit.elements().indexOf(source) + 1, originalbase.clone()); } transunit.remove(originalbase); org.dom4j.Element original = transunit.element("alt-trans"); original.addAttribute("alttranstype", "x-previous-source-version"); original.addAttribute("gs4tr:seginfo", "<root username=\"Original\" timestamp=\"" + timestamp + "\"/>"); org.dom4j.Element original_source = original.addElement("source"); original_source.addText((String) rejectedtexts.get(r)); original.addElement("target"); } } OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.sourcetxlf_seg)), "UTF8"); document_source.write(writer); writer.close(); } else { System.out.println("result: false source: " + numberOfPara_source + " formatted: " + t_count); } return isValid; }
From source file:revaligner.service.FileAligner.java
/**
 * Builds the initial paragraph-level alignment between the source document (with tracked
 * changes) and the target document and writes it as {@code rev_aligned.xml} two directory levels
 * above the source file.
 *
 * <p>For every non-empty, extractable, non-numeric source paragraph a {@code unit/src_para} is
 * emitted whose text wraps inserted / moved-to runs in {@code <ins>} and deleted / moved-from
 * runs in {@code <del>}. Paragraphs that contain deleted or unchanged text (i.e. that existed
 * before the revision) are paired with the next usable target paragraph. Target paragraphs left
 * over at the end are appended as units that only carry a {@code trg_para}.
 *
 * <p>Side effects: saves a normalised copy of the source as {@code <sourcefile>.docx} and sets
 * {@code this.alignedfile}.
 *
 * @throws Exception on document loading, conversion or I/O failures
 */
public void align() throws Exception {
    org.dom4j.Document document = DocumentHelper.createDocument();
    org.dom4j.Element root = document.addElement("alinger");
    org.dom4j.Element head = root.addElement("head");
    head.addAttribute("src_lang", this.sourcelanguage);
    head.addAttribute("trg_lang", this.targetlanguage);
    head.addAttribute("creator", this.creatorid);

    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    org.dom4j.Element content = root.addElement("content");
    // Written empty: nothing in this method adds children to it.
    root.addElement("orphans");

    com.aspose.words.Document doc_src = new com.aspose.words.Document(this.sourcefile);
    com.aspose.words.Document doc_trg = new com.aspose.words.Document(this.targetfile);
    doc_src.joinRunsWithSameFormatting();
    trimParaLeadingTrailingSpace(doc_src);
    UnlinkFields(doc_src);
    doc_src.save(this.sourcefile + ".docx");
    doc_trg.joinRunsWithSameFormatting();
    trimParaLeadingTrailingSpace(doc_trg);
    UnlinkFields(doc_trg);

    int seqnum = 0;
    int srcparaindex = 0;
    int srcparaindex_accept = 0;
    int trgparaindex = 0;
    // Node type 8: the nodes returned are cast to Paragraph below.
    int srcparacnt = doc_src.getChildNodes(8, true).getCount();
    int trgparacnt = doc_trg.getChildNodes(8, true).getCount();
    // Move ranges can span paragraphs, so these two flags survive across loop iterations.
    boolean ismovefrom = false;
    boolean ismoveto = false;
    boolean isprvdelpara = false;
    int unitid = 0;

    for (int i = 0; i < srcparacnt; i++) {
        Paragraph para_src = (Paragraph) doc_src.getChildNodes(8, true).get(i);
        String para_text = getParaText(para_src);
        boolean isExtractable = extractionSupportImpl.isExtractable(para_text);
        boolean isNumeric = org.gs4tr.foundation3.core.utils.Text.isNumeric(para_text);

        if ((!para_text.equals("")) && (isExtractable) && (!isNumeric)) {
            StringBuilder src_para_text = new StringBuilder();
            boolean hasadds = false;
            boolean hasdels = false;
            boolean hasnorm = false;
            for (int j = 0; j < para_src.getChildNodes(0, true).getCount(); j++) {
                com.aspose.words.Node node = para_src.getChildNodes(0, true).get(j);
                int nodeType = node.getNodeType();
                if (nodeType == 13) {
                    ismovefrom = true;
                } else if (nodeType == 14) {
                    ismovefrom = false;
                } else if (nodeType == 15) {
                    ismoveto = true;
                } else if (nodeType == 16) {
                    ismoveto = false;
                } else if (nodeType == 21) {
                    // Node type 21 is cast to Run; runs in the Wingdings font are skipped.
                    Run run = (Run) node;
                    if (!run.getFont().getName().equals("Wingdings")) {
                        String escaped = alignEscapeRunText(run.getText());
                        if ((run.isInsertRevision()) && (!run.isDeleteRevision())) {
                            hasadds = true;
                            src_para_text.append("<ins>").append(escaped).append("</ins>");
                        } else if (run.isDeleteRevision()) {
                            hasdels = true;
                            src_para_text.append("<del>").append(escaped).append("</del>");
                        } else if (ismoveto) {
                            hasadds = true;
                            src_para_text.append("<ins>").append(escaped).append("</ins>");
                        } else if (ismovefrom) {
                            hasdels = true;
                            src_para_text.append("<del>").append(escaped).append("</del>");
                        } else {
                            hasnorm = true;
                            src_para_text.append(escaped);
                        }
                    }
                }
            }

            String paraType;
            if ((hasadds) && (!hasdels) && (!hasnorm)) {
                paraType = "insertion";
            } else if ((!hasadds) && (!hasdels) && (hasnorm)) {
                paraType = "regular";
            } else if ((!hasadds) && (hasdels) && (!hasnorm)) {
                paraType = "deletion";
            } else {
                paraType = "mix";
            }
            org.dom4j.Element unit = content.addElement("unit");
            org.dom4j.Element src_para = unit.addElement("src_para");
            unit.addAttribute("id", Integer.toString(unitid++));
            src_para.addAttribute("para_type", paraType);
            src_para.addAttribute("para_seq", Integer.toString(srcparaindex));
            src_para.addAttribute("para_seq_acpt", Integer.toString(srcparaindex_accept));
            src_para.addText(wordToHtml(src_para_text.toString()));

            // Only paragraphs with deleted or unchanged text consume a target paragraph; a
            // paragraph made purely of insertions gets none.
            if ((hasdels) || (hasnorm)) {
                if (trgparaindex < trgparacnt) {
                    Paragraph para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(trgparaindex);
                    String para_trg_text = getParaText(para_trg);
                    boolean isExtractable_trg = extractionSupportImpl.isExtractable(para_trg_text);
                    if (trgparaindex == trgparacnt - 1) {
                        if ((!para_trg_text.equals("")) && (isExtractable_trg)) {
                            alignAddTargetPara(unit, para_trg, trgparaindex);
                        }
                    } else {
                        // Skip blank, non-extractable and purely numeric target paragraphs, but
                        // never run past the last paragraph.
                        while ((doc_trg.getChildNodes(8, true).get(trgparaindex).getText().trim().equals(""))
                                || (!extractionSupportImpl.isExtractable(
                                        doc_trg.getChildNodes(8, true).get(trgparaindex).getText()))
                                || (org.gs4tr.foundation3.core.utils.Text.isNumeric(
                                        doc_trg.getChildNodes(8, true).get(trgparaindex).getText().trim()))) {
                            trgparaindex++;
                            if (trgparaindex == trgparacnt - 1) {
                                break;
                            }
                        }
                        para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(trgparaindex);
                        para_trg_text = getParaText(para_trg);
                        if ((!para_trg_text.equals("")) && (extractionSupportImpl.isExtractable(para_trg_text))
                                && (!org.gs4tr.foundation3.core.utils.Text.isNumeric(para_trg_text))) {
                            alignAddTargetPara(unit, para_trg, trgparaindex);
                        }
                    }
                }
                trgparaindex++;
            }

            if (para_src.isDeleteRevision()) {
                isprvdelpara = true;
            } else {
                seqnum++;
                isprvdelpara = false;
            }
            if ((para_src.isEndOfCell()) || (para_src.isEndOfHeaderFooter()) || (para_src.isEndOfSection())) {
                seqnum++;
                isprvdelpara = false;
            }
            srcparaindex++;
            if (!isprvdelpara) {
                srcparaindex_accept++;
            }
        } else {
            // Skipped paragraph: still track move ranges that start or end inside it.
            for (int j = 0; j < para_src.getChildNodes(0, true).getCount(); j++) {
                int nodeType = para_src.getChildNodes(0, true).get(j).getNodeType();
                if (nodeType == 13) {
                    ismovefrom = true;
                } else if (nodeType == 14) {
                    ismovefrom = false;
                } else if (nodeType == 15) {
                    ismoveto = true;
                } else if (nodeType == 16) {
                    ismoveto = false;
                }
            }
            if (!para_src.isDeleteRevision()) {
                seqnum++;
                isprvdelpara = false;
                srcparaindex_accept++;
            }
            srcparaindex++;
        }
    }

    // Target paragraphs that no source paragraph consumed become target-only units.
    for (int i = trgparaindex; i < trgparacnt; i++) {
        Paragraph para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(i);
        if ((!para_trg.getText().trim().equals(""))
                && (extractionSupportImpl.isExtractable(para_trg.getText()))) {
            org.dom4j.Element unit = content.addElement("unit");
            // NOTE(review): these units are numbered from seqnum while the units above use
            // unitid, so ids may repeat - confirm which counter is intended.
            unit.addAttribute("id", Integer.toString(seqnum));
            // para_seq is the real paragraph index i. The previous code used a counter that only
            // advanced for emitted paragraphs and therefore drifted after the first skipped one.
            alignAddTargetPara(unit, para_trg, i);
            seqnum++;
        }
    }

    this.alignedfile = (new File(new File(this.sourcefile).getParent()).getParent() + "/rev_aligned.xml");
    File alignedOut = new File(this.alignedfile);
    if (alignedOut.exists()) {
        alignedOut.delete();
    }
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        // Always release the file handle, even when serialisation fails.
        writer.close();
    }
}

/**
 * Escapes the angle brackets of literal document text so they cannot be mistaken for the
 * {@code <ins>}/{@code <del>} markers added around revised runs.
 */
private static String alignEscapeRunText(String text) {
    return text.replace("<", "&lt;").replace(">", "&gt;");
}

/**
 * Adds a {@code trg_para} child to {@code unit} holding the escaped text of all runs of
 * {@code para_trg} that are not in the Wingdings font, tagged with {@code paraSeq} as para_seq.
 */
private void alignAddTargetPara(org.dom4j.Element unit, Paragraph para_trg, int paraSeq) throws Exception {
    org.dom4j.Element trg_para = unit.addElement("trg_para");
    StringBuilder trg_para_text = new StringBuilder();
    for (int j = 0; j < para_trg.getChildNodes(21, true).getCount(); j++) {
        Run run = (Run) para_trg.getChildNodes(21, true).get(j);
        if (!run.getFont().getName().equals("Wingdings")) {
            trg_para_text.append(alignEscapeRunText(run.getText()));
        }
    }
    trg_para.addAttribute("para_seq", Integer.toString(paraSeq));
    trg_para.addText(wordToHtml(trg_para_text.toString()));
}
From source file:revaligner.service.FileAligner.java
public void update(JSONArray arr, JSONArray missings, JSONArray locks, JSONArray segaligned, JSONArray targets, JSONArray missing_targets, int cnt) throws Exception { File alignedFile = new File(this.alignedfile); if (!alignedFile.exists()) { throw new FileNotFoundException("Could not find aligned xml file"); }//from w w w . j a va 2 s . com ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); List<JSONArray> trg_list = new ArrayList(); for (int i = 0; i < arr.length(); i++) { trg_list.add(arr.getJSONArray(i)); } List<JSONArray> missings_list = new ArrayList(); for (int i = 0; i < missings.length(); i++) { missings_list.add(missings.getJSONArray(i)); } List<String> locks_list = new ArrayList(); for (int i = 0; i < locks.length(); i++) { locks_list.add(locks.getString(i)); } List<String> segaligned_list = new ArrayList(); for (int i = 0; i < segaligned.length(); i++) { segaligned_list.add(segaligned.getString(i)); } this.nullcnt = cnt; int unitcnt = 0; org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); org.dom4j.Element root = document.getRootElement(); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { unitcnt++; org.dom4j.Element unit = (org.dom4j.Element) units.get(j); String seq = i + " - " + j; if (locks_list.contains(seq)) { unit.attribute("locked").setValue("true"); } else { unit.attribute("locked").setValue("false"); } if (segaligned_list.contains(seq)) { unit.attribute("alignsegs").setValue("true"); } else { 
unit.attribute("alignsegs").setValue("false"); } if (unitcnt <= trg_list.size()) { JSONArray segs = (JSONArray) trg_list.get(unitcnt - 1); String trg_para_text = targets.getString(unitcnt - 1); org.dom4j.Element trg_para = unit.element("trg_para"); if (trg_para != null) { trg_para.clearContent(); } else { trg_para = unit.addElement("trg_para"); } trg_para.addAttribute("id", segs.getString(0)); org.dom4j.Element trg_text = trg_para.addElement("text"); org.dom4j.Element trg_segs = trg_para.addElement("segments"); for (int s = 1; s < segs.length(); s++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); trg_seg.addAttribute("id", Integer.toString(s - 1)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(segs.getString(s)))); trg_seg.setText(decodehtmlstring(segs.getString(s))); } trg_text.setText(decodehtmlstring(trg_para_text)); } else { units.remove(j); j--; } } if (group.elements("unit").size() == 0) { groups.remove(i); i--; } } org.dom4j.Element orphans = root.element("orphans"); orphans.clearContent(); for (int i = 0; i < missings_list.size(); i++) { JSONArray segs = (JSONArray) missings_list.get(i); String trg_para_text = missing_targets.getString(i); org.dom4j.Element unit = orphans.addElement("unit"); unit.addAttribute("id", Integer.toString(i)); org.dom4j.Element trg_para = unit.addElement("trg_para"); trg_para.addAttribute("id", segs.getString(0)); org.dom4j.Element trg_text = trg_para.addElement("text"); org.dom4j.Element trg_segs = trg_para.addElement("segments"); for (int s = 1; s < segs.length(); s++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); trg_seg.addAttribute("id", Integer.toString(s)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(segs.getString(s)))); trg_seg.setText(decodehtmlstring(segs.getString(s))); } trg_text.setText(decodehtmlstring(trg_para_text)); } new File(this.alignedfile).delete(); OutputStreamWriter writer = new 
OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8"); document.write(writer); writer.close(); }
From source file:revaligner.service.FileAligner.java
/**
 * Rewrites the aligned XML file with caller-supplied segment-level target data and removes a
 * pending auto-save file afterwards.
 *
 * <p>Target segments are consumed from {@code targets}/{@code trg_seqs} in document order. A unit
 * with a {@code src_para} keeps at most as many target segments as it has source segments;
 * surplus target segments and segments for which no data is left are removed, as are units and
 * groups that end up empty. Data left over after the last unit is appended as a new group, and
 * the {@code orphans} element is rebuilt from {@code missing_targets}/{@code missing_trg_seqs}.
 *
 * @param targets          target segment texts, in document order
 * @param trg_seqs         id of each target segment, parallel to {@code targets}
 * @param missing_targets  orphan segment texts
 * @param missing_trg_seqs id of each orphan segment, parallel to {@code missing_targets}
 * @param locks            keys {@code "<group> - <unit> - <segment>"} of locked source segments
 * @param cnt              stored in {@code this.nullcnt}
 * @param edited           segment ids to be flagged {@code edited="true"}
 * @param review           keys (same form as {@code locks}) flagged {@code needreview}
 * @param ignore           keys (same form as {@code locks}) flagged {@code ignored}
 * @throws FileNotFoundException if the aligned file does not exist
 * @throws Exception             on JSON, XML or I/O failures
 */
public void update_seg(JSONArray targets, JSONArray trg_seqs, JSONArray missing_targets,
        JSONArray missing_trg_seqs, JSONArray locks, int cnt, JSONArray edited, JSONArray review,
        JSONArray ignore) throws Exception {
    File alignedFile = new File(this.alignedfile);
    if (!alignedFile.exists()) {
        throw new FileNotFoundException("Could not find aligned xml file");
    }
    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    // Copy the JSON input up front so malformed entries fail before the document is touched.
    List<String> trg_list = new ArrayList<String>();
    for (int i = 0; i < targets.length(); i++) {
        trg_list.add(targets.getString(i));
    }
    List<String> trg_seq_list = new ArrayList<String>();
    for (int i = 0; i < trg_seqs.length(); i++) {
        trg_seq_list.add(trg_seqs.getString(i));
    }
    List<String> missing_list = new ArrayList<String>();
    for (int i = 0; i < missing_targets.length(); i++) {
        missing_list.add(missing_targets.getString(i));
    }
    List<String> missing_seq_list = new ArrayList<String>();
    for (int i = 0; i < missing_trg_seqs.length(); i++) {
        missing_seq_list.add(missing_trg_seqs.getString(i));
    }
    List<String> locks_list = new ArrayList<String>();
    for (int i = 0; i < locks.length(); i++) {
        locks_list.add(locks.getString(i));
    }
    this.nullcnt = cnt;
    List<String> edited_list = new ArrayList<String>();
    for (int i = 0; i < edited.length(); i++) {
        edited_list.add(edited.getString(i));
    }
    List<String> review_list = new ArrayList<String>();
    for (int i = 0; i < review.length(); i++) {
        review_list.add(review.getString(i));
    }
    List<String> ignore_list = new ArrayList<String>();
    for (int i = 0; i < ignore.length(); i++) {
        ignore_list.add(ignore.getString(i));
    }

    int segcnt = 0;
    org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile);
    org.dom4j.Element root = document.getRootElement();
    List<org.dom4j.Element> groups = root.element("aligned").elements("group");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = groups.get(i);
        List<org.dom4j.Element> units = group.elements("unit");
        for (int j = 0; j < units.size(); j++) {
            org.dom4j.Element unit = units.get(j);
            unit.addAttribute("alignsegs", "true");

            // Units without a source paragraph may keep any number of target segments.
            int maxTargetSegs = Integer.MAX_VALUE;
            org.dom4j.Element src_para = unit.element("src_para");
            if (src_para != null) {
                List<org.dom4j.Element> src_segs = src_para.element("segments").elements("src_seg");
                maxTargetSegs = src_segs.size();
                for (int z = 0; z < src_segs.size(); z++) {
                    org.dom4j.Element src_seg = src_segs.get(z);
                    String seq = i + " - " + j + " - " + z;
                    // addAttribute replaces an existing value and creates a missing attribute.
                    src_seg.addAttribute("locked", Boolean.toString(locks_list.contains(seq)));
                    src_seg.addAttribute("needreview", Boolean.toString(review_list.contains(seq)));
                    src_seg.addAttribute("ignored", Boolean.toString(ignore_list.contains(seq)));
                }
            }

            org.dom4j.Element trg_para = unit.element("trg_para");
            trg_para.remove(trg_para.element("text"));
            org.dom4j.Element segments = trg_para.element("segments");
            List<org.dom4j.Element> trg_segs = segments.elements("trg_seg");
            for (int z = 0; z < trg_segs.size(); z++) {
                org.dom4j.Element trg_seg = trg_segs.get(z);
                if ((z < maxTargetSegs) && (segcnt < trg_list.size())) {
                    String seqId = decodehtmlstring(trg_seq_list.get(segcnt));
                    trg_seg.addAttribute("id", seqId);
                    if ((edited_list.contains(seqId)) || (seqId.startsWith("n - "))) {
                        trg_seg.addAttribute("edited", "true");
                    }
                    String text = decodehtmlstring(trg_list.get(segcnt));
                    trg_seg.addAttribute("isExtractable",
                            Boolean.toString(extractionSupportImpl.isExtractable(text)));
                    trg_seg.setText(text);
                    segcnt++;
                } else {
                    // Surplus target segment, or no data left for it.
                    segments.remove(trg_seg);
                }
            }
            if (segments.elements("trg_seg").size() == 0) {
                group.remove(unit);
            }
        }
        if (group.elements("unit").size() == 0) {
            groups.remove(i);
            i--;
        }
    }

    // Target segments not absorbed by any existing unit go into one extra group.
    if (segcnt < trg_list.size()) {
        org.dom4j.Element group = root.element("aligned").addElement("group");
        group.addAttribute("id", Integer.toString(groups.size()));
        org.dom4j.Element unit = group.addElement("unit");
        unit.addAttribute("id", "0");
        org.dom4j.Element trg_para = unit.addElement("trg_para");
        trg_para.addAttribute("id", Integer.toString(groups.size()) + " - 0");
        org.dom4j.Element trgsegs = trg_para.addElement("segments");
        for (int x = segcnt; x < trg_list.size(); x++) {
            String text = decodehtmlstring(trg_list.get(x));
            String seqId = decodehtmlstring(trg_seq_list.get(x));
            org.dom4j.Element trg_seg = trgsegs.addElement("trg_seg");
            trg_seg.addAttribute("id", seqId);
            if ((edited_list.contains(seqId)) || (seqId.startsWith("n - "))) {
                trg_seg.addAttribute("edited", "true");
            }
            trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
            trg_seg.setText(text);
        }
    }

    org.dom4j.Element orphans = root.element("orphans");
    orphans.clearContent();
    org.dom4j.Element orphanUnit = orphans.addElement("unit");
    orphanUnit.addAttribute("id", "0");
    org.dom4j.Element orphanPara = orphanUnit.addElement("trg_para");
    orphanPara.addAttribute("id", "0 - 0");
    org.dom4j.Element orphanSegs = orphanPara.addElement("segments");
    for (int i = 0; i < missing_list.size(); i++) {
        org.dom4j.Element trg_seg = orphanSegs.addElement("trg_seg");
        String text = decodehtmlstring(missing_list.get(i));
        // The "n - " test must look at the orphan's own id. It previously indexed trg_seq_list,
        // which flagged the wrong segments and could throw IndexOutOfBoundsException when there
        // were more orphans than aligned segments.
        String seqId = decodehtmlstring(missing_seq_list.get(i));
        trg_seg.addAttribute("id", seqId);
        if ((edited_list.contains(seqId)) || (seqId.startsWith("n - "))) {
            trg_seg.addAttribute("edited", "true");
        }
        trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
        trg_seg.setText(text);
    }

    alignedFile.delete();
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        // Always release the file handle, even when serialisation fails.
        writer.close();
    }

    // The explicit save supersedes any auto-save. The path is null until an auto-save has run,
    // and new File((String) null) would throw.
    if (this.auto_saved_alignedfile != null) {
        File autoSaved = new File(this.auto_saved_alignedfile);
        if (autoSaved.exists()) {
            autoSaved.delete();
        }
    }
}
From source file:revaligner.service.FileAligner.java
/**
 * Auto-save variant of the segment-level update: applies the caller-supplied target data to the
 * aligned document in memory and writes the result to {@code <alignedfile>.temp}, leaving the
 * aligned file itself untouched. The temp path is remembered in
 * {@code this.auto_saved_alignedfile}.
 *
 * <p>Target segments are consumed from {@code targets}/{@code trg_seqs} in document order. A unit
 * with a {@code src_para} keeps at most as many target segments as it has source segments;
 * surplus target segments and segments for which no data is left are removed, as are units and
 * groups that end up empty. Data left over after the last unit is appended as a new group, and
 * the {@code orphans} element is rebuilt from {@code missing_targets}/{@code missing_trg_seqs}.
 *
 * @param targets          target segment texts, in document order
 * @param trg_seqs         id of each target segment, parallel to {@code targets}
 * @param missing_targets  orphan segment texts
 * @param missing_trg_seqs id of each orphan segment, parallel to {@code missing_targets}
 * @param locks            keys {@code "<group> - <unit> - <segment>"} of locked source segments
 * @param cnt              stored in {@code this.nullcnt}
 * @param edited           segment ids to be flagged {@code edited="true"}
 * @param review           keys (same form as {@code locks}) flagged {@code needreview}
 * @param ignore           keys (same form as {@code locks}) flagged {@code ignored}
 * @throws FileNotFoundException if the aligned file does not exist
 * @throws Exception             on JSON, XML or I/O failures
 */
public void auto_update_seg(JSONArray targets, JSONArray trg_seqs, JSONArray missing_targets,
        JSONArray missing_trg_seqs, JSONArray locks, int cnt, JSONArray edited, JSONArray review,
        JSONArray ignore) throws Exception {
    File alignedFile = new File(this.alignedfile);
    if (!alignedFile.exists()) {
        throw new FileNotFoundException("Could not find aligned xml file");
    }
    ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl(
            Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage));
    Configuration config = new BaseConfiguration();
    config.setProperty("extraction.tokens.extract", "all");
    extractionSupportImpl.setConfiguration(config);

    // Copy the JSON input up front so malformed entries fail before the document is touched.
    List<String> trg_list = new ArrayList<String>();
    for (int i = 0; i < targets.length(); i++) {
        trg_list.add(targets.getString(i));
    }
    List<String> trg_seq_list = new ArrayList<String>();
    for (int i = 0; i < trg_seqs.length(); i++) {
        trg_seq_list.add(trg_seqs.getString(i));
    }
    List<String> missing_list = new ArrayList<String>();
    for (int i = 0; i < missing_targets.length(); i++) {
        missing_list.add(missing_targets.getString(i));
    }
    List<String> missing_seq_list = new ArrayList<String>();
    for (int i = 0; i < missing_trg_seqs.length(); i++) {
        missing_seq_list.add(missing_trg_seqs.getString(i));
    }
    List<String> locks_list = new ArrayList<String>();
    for (int i = 0; i < locks.length(); i++) {
        locks_list.add(locks.getString(i));
    }
    this.nullcnt = cnt;
    List<String> edited_list = new ArrayList<String>();
    for (int i = 0; i < edited.length(); i++) {
        edited_list.add(edited.getString(i));
    }
    List<String> review_list = new ArrayList<String>();
    for (int i = 0; i < review.length(); i++) {
        review_list.add(review.getString(i));
    }
    List<String> ignore_list = new ArrayList<String>();
    for (int i = 0; i < ignore.length(); i++) {
        ignore_list.add(ignore.getString(i));
    }

    int segcnt = 0;
    org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile);
    org.dom4j.Element root = document.getRootElement();
    List<org.dom4j.Element> groups = root.element("aligned").elements("group");
    for (int i = 0; i < groups.size(); i++) {
        org.dom4j.Element group = groups.get(i);
        List<org.dom4j.Element> units = group.elements("unit");
        for (int j = 0; j < units.size(); j++) {
            org.dom4j.Element unit = units.get(j);
            unit.addAttribute("alignsegs", "true");

            // Units without a source paragraph may keep any number of target segments.
            int maxTargetSegs = Integer.MAX_VALUE;
            org.dom4j.Element src_para = unit.element("src_para");
            if (src_para != null) {
                List<org.dom4j.Element> src_segs = src_para.element("segments").elements("src_seg");
                maxTargetSegs = src_segs.size();
                for (int z = 0; z < src_segs.size(); z++) {
                    org.dom4j.Element src_seg = src_segs.get(z);
                    String seq = i + " - " + j + " - " + z;
                    // addAttribute replaces an existing value and creates a missing attribute.
                    src_seg.addAttribute("locked", Boolean.toString(locks_list.contains(seq)));
                    src_seg.addAttribute("needreview", Boolean.toString(review_list.contains(seq)));
                    src_seg.addAttribute("ignored", Boolean.toString(ignore_list.contains(seq)));
                }
            }

            org.dom4j.Element trg_para = unit.element("trg_para");
            trg_para.remove(trg_para.element("text"));
            org.dom4j.Element segments = trg_para.element("segments");
            List<org.dom4j.Element> trg_segs = segments.elements("trg_seg");
            for (int z = 0; z < trg_segs.size(); z++) {
                org.dom4j.Element trg_seg = trg_segs.get(z);
                if ((z < maxTargetSegs) && (segcnt < trg_list.size())) {
                    String seqId = decodehtmlstring(trg_seq_list.get(segcnt));
                    trg_seg.addAttribute("id", seqId);
                    if ((edited_list.contains(seqId)) || (seqId.startsWith("n - "))) {
                        trg_seg.addAttribute("edited", "true");
                    }
                    String text = decodehtmlstring(trg_list.get(segcnt));
                    trg_seg.addAttribute("isExtractable",
                            Boolean.toString(extractionSupportImpl.isExtractable(text)));
                    trg_seg.setText(text);
                    segcnt++;
                } else {
                    // Surplus target segment, or no data left for it.
                    segments.remove(trg_seg);
                }
            }
            if (segments.elements("trg_seg").size() == 0) {
                group.remove(unit);
            }
        }
        if (group.elements("unit").size() == 0) {
            groups.remove(i);
            i--;
        }
    }

    // Target segments not absorbed by any existing unit go into one extra group.
    if (segcnt < trg_list.size()) {
        org.dom4j.Element group = root.element("aligned").addElement("group");
        group.addAttribute("id", Integer.toString(groups.size()));
        org.dom4j.Element unit = group.addElement("unit");
        unit.addAttribute("id", "0");
        org.dom4j.Element trg_para = unit.addElement("trg_para");
        org.dom4j.Element trgsegs = trg_para.addElement("segments");
        for (int x = segcnt; x < trg_list.size(); x++) {
            String text = decodehtmlstring(trg_list.get(x));
            String seqId = decodehtmlstring(trg_seq_list.get(x));
            org.dom4j.Element trg_seg = trgsegs.addElement("trg_seg");
            trg_seg.addAttribute("id", seqId);
            if ((edited_list.contains(seqId)) || (seqId.startsWith("n - "))) {
                trg_seg.addAttribute("edited", "true");
            }
            trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
            trg_seg.setText(text);
        }
    }

    org.dom4j.Element orphans = root.element("orphans");
    orphans.clearContent();
    org.dom4j.Element orphanUnit = orphans.addElement("unit");
    orphanUnit.addAttribute("id", "0");
    org.dom4j.Element orphanPara = orphanUnit.addElement("trg_para");
    org.dom4j.Element orphanSegs = orphanPara.addElement("segments");
    for (int i = 0; i < missing_list.size(); i++) {
        org.dom4j.Element trg_seg = orphanSegs.addElement("trg_seg");
        String text = decodehtmlstring(missing_list.get(i));
        // The "n - " test must look at the orphan's own id. It previously indexed trg_seq_list,
        // which flagged the wrong segments and could throw IndexOutOfBoundsException when there
        // were more orphans than aligned segments.
        String seqId = decodehtmlstring(missing_seq_list.get(i));
        trg_seg.addAttribute("id", seqId);
        if ((edited_list.contains(seqId)) || (seqId.startsWith("n - "))) {
            trg_seg.addAttribute("edited", "true");
        }
        trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text)));
        trg_seg.setText(text);
    }

    this.auto_saved_alignedfile = (this.alignedfile + ".temp");
    File autoSaved = new File(this.auto_saved_alignedfile);
    if (autoSaved.exists()) {
        autoSaved.delete();
    }
    OutputStreamWriter writer = new OutputStreamWriter(
            new BufferedOutputStream(new FileOutputStream(this.auto_saved_alignedfile)), "UTF8");
    try {
        document.write(writer);
    } finally {
        // Always release the file handle, even when serialisation fails.
        writer.close();
    }
}
From source file:revaligner.service.FileAligner.java
/**
 * Converts the document at {@code filePath} with {@code ConvertDOC} and then runs
 * {@code segmentTxlf} on {@code filePath + ".txlf"}.
 *
 * <p>Side effects: the input document is re-saved in place after
 * {@code joinRunsWithSameFormatting()}, and an existing {@code filePath + ".txlf"} is
 * deleted before the conversion starts.
 *
 * @param filePath path of the document to convert; the file is overwritten in place
 * @param segmentParagraph passed through unchanged to {@code segmentTxlf}
 * @param sourcelanguage language code used to build the conversion locale; also passed to
 *     {@code segmentTxlf}
 * @return {@code filePath + ".txlf"}
 * @throws Exception propagated from loading/saving the document, the conversion, or
 *     {@code segmentTxlf}
 */
public String convertFileToTxlf(String filePath, boolean segmentParagraph, String sourcelanguage)
        throws Exception {
    // Normalise the runs and overwrite the input so the converter sees the merged document.
    com.aspose.words.Document doc = new com.aspose.words.Document(filePath);
    doc.joinRunsWithSameFormatting();
    doc.save(filePath);

    // Declared as ArrayList (not List) because the parameter type of ConvertDOC.convert
    // is not visible here.
    ArrayList<String> srcs = new ArrayList<String>();
    srcs.add(filePath);

    // Remove any TXLF left over from a previous run (best effort, result not checked).
    // NOTE(review): presumably ConvertDOC writes its output to this path - confirm.
    String orgtxlfname = filePath + ".txlf";
    File previousTxlf = new File(orgtxlfname);
    if (previousTxlf.exists()) {
        previousTxlf.delete();
    }

    Locale locale = Locale.makeLocale(sourcelanguage);

    Configuration config = new BaseConfiguration();
    config.setProperty("word.acceptTrackChanges", "true");
    config.setProperty("word.extractDropDownList", "false");
    config.setProperty("word.extractEquations", "false");
    config.setProperty("word.extractComments", "false");
    config.setProperty("extraction.tokens.extract", "all");
    config.setProperty("word.translateHyperlinkText", "true");
    config.setProperty("word.translateHyperlinkValue", "false");
    config.setProperty("word.ignoreBiLingualStyles", "true");

    ConvertDOC converter = new ConvertDOC();
    converter.setConfiguration(config);
    converter.setIgnoreSuccessfullConversion(true);
    converter.convert(srcs, locale);

    segmentTxlf(orgtxlfname, segmentParagraph, sourcelanguage);
    return orgtxlfname;
}
From source file:revaligner.service.FileAligner.java
public boolean verifyParas() throws Exception { System.out.println("verifying paragraphs mapping...."); boolean isValid = false; ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); org.dom4j.Document document_source = XmlParser.parseXmlFile(this.sourcetxlf_nonSeg); org.dom4j.Element root_source = document_source.getRootElement(); org.dom4j.Document document_formatted = XmlParser.parseXmlFile(this.reformattedsourcetxlf_nonSeg); org.dom4j.Element root_formatted = document_formatted.getRootElement(); List list_source = root_source.selectNodes("//*[name() = 'trans-unit']"); int numberOfPara_source = list_source.size(); List list_formatted = root_formatted.selectNodes("//*[name() = 'trans-unit']"); int numberOfPara_formatted = list_formatted.size(); List<org.dom4j.Element> text_source = new ArrayList(); Iterator iter_source = list_source.iterator(); while (iter_source.hasNext()) { org.dom4j.Element source = ((org.dom4j.Element) iter_source.next()).element("source"); text_source.add(source);// www. ja v a 2 s. 
co m } List<org.dom4j.Element> text_formatted = new ArrayList(); Iterator iter_formatted = list_formatted.iterator(); while (iter_formatted.hasNext()) { org.dom4j.Element source = ((org.dom4j.Element) iter_formatted.next()).element("source"); text_formatted.add(source); } Workbook wb = new Workbook(); Worksheet ws = wb.getWorksheets().get(0); Cells cells = ws.getCells(); int count = Math.max(text_source.size(), text_formatted.size()); int t_count = 0; for (int i = 0; i < count; i++) { String t_src = ""; String t_fom = ""; if (i < text_source.size()) { org.dom4j.Element src = (org.dom4j.Element) text_source.get(i); for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { t_src = t_src + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { t_src = t_src + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb"))) { t_src = t_src + "<br> "; } } } } cells.get(i, 0).setHtmlString("<html>" + t_src.trim().replace("<br> ", "↵<br>") + "</html>"); if (i < text_formatted.size()) { org.dom4j.Element src = (org.dom4j.Element) text_formatted.get(i); ArrayList<String> node_ids = new ArrayList(); for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { t_fom = t_fom + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("bx")) && (e.attribute("ctype").getValue().equals("x-strike-through"))) { t_fom = t_fom + "<strike>"; node_ids.add(e.attribute("rid").getValue()); } else if 
(e.getName().equals("ex")) { if (node_ids.contains(e.attribute("rid").getValue())) { t_fom = t_fom + "</strike>"; node_ids.remove(e.attribute("rid").getValue()); } } else if ((e.getName().equals("bpt")) && (e.attribute("ctype").getValue().equals("x-underlined")) && (e.getText().contains("type=\"1\""))) { t_fom = t_fom + "<u>"; node_ids.add(e.attribute("rid").getValue()); } else if (e.getName().equals("ept")) { if (node_ids.contains(e.attribute("rid").getValue())) { t_fom = t_fom + "</u>"; node_ids.remove(e.attribute("rid").getValue()); } } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { t_fom = t_fom + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb"))) { t_fom = t_fom + "<br> "; } } } if (!t_fom.contains("<u>")) { if (src.selectNodes("..//*[name() = 'it'][@ctype = 'x-underlined'][@pos = 'open']") .size() != 0) { org.dom4j.Node node = (org.dom4j.Node) src .selectNodes("..//*[name() = 'it'][@ctype = 'x-underlined'][@pos = 'open']").get(0); if (node.getText().contains("type=\"1\"")) { t_fom = "<u>" + t_fom + "</u>"; } } } else if ((!t_fom.contains("<strike>")) && (src.selectNodes("..//*[name() = 'it'][@ctype = 'x-strike-through'][@pos = 'open']") .size() != 0)) { t_fom = "<strike>" + t_fom + "</strike>"; } } String accepted_t_fom = t_fom.replaceAll("(?s)<strike>.*?</strike>", "").replace("<u>", "") .replace("</u>", "").replace("&paradel;", "").replace("&parains;", "") .replace("&hf;", "").replace("<br>", ""); if (extractionSupportImpl.isExtractable(accepted_t_fom)) { String input = "<html>" + t_fom.replace("&hf;", "").replace("&parains;", "") .replace("<br> ", "↵<br>").replace("&paradel;", "<strike>¶</strike><br>") + "</html>"; cells.get(t_count, 1).setHtmlString(input); t_count++; } } wb.save(this.prjfolder + File.separator + "verifyParas.xlsx"); if (numberOfPara_source == t_count) { System.out.println("result: TRUE source: " + numberOfPara_source + " formatted: " + t_count); isValid 
= true; } else { System.out.println("result: false source: " + numberOfPara_source + " formatted: " + t_count); } return isValid; }