List of usage examples for org.apache.commons.io FileUtils readFileToString
public static String readFileToString(File file, String encoding) throws IOException
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step2FillWithRetrievedResults.java
public static void main(String[] args) throws IOException { // input dir - list of xml query containers File inputDir = new File(args[0]); // retrieved results from Technion // ltr-50queries-100docs.txt File ltr = new File(args[1]); // output dir File outputDir = new File(args[2]); if (!outputDir.exists()) { outputDir.mkdirs();/* w ww .j a v a 2 s.c o m*/ } // load the query containers first (into map: id + container) Map<String, QueryResultContainer> queryResults = new HashMap<>(); for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { System.out.println(f); QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); queryResults.put(queryResultContainer.qID, queryResultContainer); } // iterate over IR results for (String line : FileUtils.readLines(ltr)) { String[] split = line.split("\\s+"); Integer origQueryId = Integer.valueOf(split[0]); String clueWebID = split[2]; Integer rank = Integer.valueOf(split[3]); double score = Double.valueOf(split[4]); String additionalInfo = split[5]; // get the container for this result QueryResultContainer container = queryResults.get(origQueryId.toString()); if (container != null) { // add new result QueryResultContainer.SingleRankedResult result = new QueryResultContainer.SingleRankedResult(); result.clueWebID = clueWebID; result.rank = rank; result.score = score; result.additionalInfo = additionalInfo; if (container.rankedResults == null) { container.rankedResults = new ArrayList<>(); } container.rankedResults.add(result); } } // save all containers to the output dir for (QueryResultContainer queryResultContainer : queryResults.values()) { File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:de.tudarmstadt.ukp.csniper.resbuild.stuff.FilterPipe.java
public static void main(String[] args) throws IOException { List<String> files = new ArrayList<String>(); int i = 0;/*from w w w. j av a2s . c om*/ for (File file : FileUtils.listFiles(new File(base), new String[] { "csv" }, true)) { String text = FileUtils.readFileToString(file, "UTF-8"); files.add(StringUtils.substringBeforeLast(file.getName(), ".") + ".xml"); if (StringUtils.containsAny(text, "")) { files.remove(StringUtils.substringBeforeLast(file.getName(), ".") + ".xml"); } i++; if (i % 100 == 0) { System.out.println("ok:" + i); } } FileUtils.writeLines(new File("D:\\hadoop\\output\\BNC_new\\exclusions.txt"), "UTF-8", files); }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step3AddRawDocumentsFromClueWeb.java
public static void main(String[] args) throws IOException { // input dir - list of xml query containers // step2a-retrieved-results File inputDir = new File(args[0]); // warc.bz file containing all required documents according to ClueWeb IDs // ltr-50queries-100docs-clueweb-export.warc.gz File warc = new File(args[1]); // output dir File outputDir = new File(args[2]); if (!outputDir.exists()) { outputDir.mkdirs();//w w w.java2 s.com } // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); // iterate over warc for each query WARCFileReader reader = new WARCFileReader(new Configuration(), new Path(warc.getAbsolutePath())); try { while (true) { WARCRecord read = reader.read(); String trecId = read.getHeader().getField("WARC-TREC-ID"); // now iterate over retrieved results for the query and find matching IDs for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { if (rankedResults.clueWebID.equals(trecId)) { // add the raw html content String fullHTTPResponse = new String(read.getContent(), "utf-8"); // TODO fix coding? String html = removeHTTPHeaders(fullHTTPResponse); rankedResults.originalHtml = sanitizeXmlChars(html.trim()); } } } } catch (EOFException e) { // end of file } // check if all results have filled html for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { if (rankedResults.originalHtml == null) { System.err.println("Missing original html for\t" + rankedResults.clueWebID + ", setting relevance to false"); rankedResults.relevant = Boolean.FALSE.toString(); } } // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:com.sonicle.webtop.mail.TestRegexReplace.java
public static void main(String args[]) throws Exception { String content = FileUtils.readFileToString(new File("/export/home/gbulfon/content2.txt"), "UTF-8"); String regex1 = RegexUtils.escapeRegexSpecialChars( "service-request?service=com.sonicle.webtop.mail&csrf=iakyng66lcbs277m&action=PreviewAttachment&nowriter=true&uploadId="); String regex2 = RegexUtils.escapeRegexSpecialChars("&cid="); String replaced = StringUtils.replacePattern(content, regex1 + ".{36}" + regex2, "cid:"); regex1 = RegexUtils.escapeRegexSpecialChars( "service-request?service=com.sonicle.webtop.mail&csrf=iakyng66lcbs277m&action=PreviewAttachment&nowriter=true&uploadId="); regex2 = RegexUtils.escapeRegexSpecialChars("&cid="); replaced = StringUtils.replacePattern(content, regex1 + ".{36}" + regex2, "cid:"); }
From source file:com.termmed.sampling.ConceptsWithMoreThanThreeRoleGroups.java
/** * The main method./*from ww w . j a v a 2s .c om*/ * * @param args the arguments * @throws Exception the exception */ public static void main(String[] args) throws Exception { System.out.println("Starting..."); Map<String, Set<String>> groupsMap = new HashMap<String, Set<String>>(); File relsFile = new File( "/Users/alo/Downloads/SnomedCT_RF2Release_INT_20160131-1/Snapshot/Terminology/sct2_Relationship_Snapshot_INT_20160131.txt"); BufferedReader br2 = new BufferedReader(new FileReader(relsFile)); String line2; int count2 = 0; while ((line2 = br2.readLine()) != null) { // process the line. count2++; if (count2 % 10000 == 0) { //System.out.println(count2); } List<String> columns = Arrays.asList(line2.split("\t", -1)); if (columns.size() >= 6) { if (columns.get(2).equals("1") && !columns.get(6).equals("0")) { if (!groupsMap.containsKey(columns.get(4))) { groupsMap.put(columns.get(4), new HashSet<String>()); } groupsMap.get(columns.get(4)).add(columns.get(6)); } } } System.out.println("Relationship groups loaded"); Gson gson = new Gson(); System.out.println("Reading JSON 1"); File crossoverFile1 = new File("/Users/alo/Downloads/crossover_role_to_group.json"); String contents = FileUtils.readFileToString(crossoverFile1, "utf-8"); Type collectionType = new TypeToken<Collection<ControlResultLine>>() { }.getType(); List<ControlResultLine> lineObject = gson.fromJson(contents, collectionType); Set<String> crossovers1 = new HashSet<String>(); for (ControlResultLine loopResult : lineObject) { crossovers1.add(loopResult.conceptId); } System.out.println("Crossovers 1 loaded, " + lineObject.size() + " Objects"); System.out.println("Reading JSON 2"); File crossoverFile2 = new File("/Users/alo/Downloads/crossover_group_to_group.json"); String contents2 = FileUtils.readFileToString(crossoverFile2, "utf-8"); List<ControlResultLine> lineObject2 = gson.fromJson(contents2, collectionType); Set<String> crossovers2 = new HashSet<String>(); for (ControlResultLine loopResult : lineObject2) { crossovers2.add(loopResult.conceptId); } System.out.println("Crossovers 2 loaded, " + lineObject2.size() + " Objects"); Set<String> foundConcepts = new HashSet<String>(); int count3 = 0; BufferedWriter writer = new BufferedWriter( new FileWriter(new File("ConceptsWithMoreThanThreeRoleGroups.csv"))); ; for (String loopConcept : groupsMap.keySet()) { if (groupsMap.get(loopConcept).size() > 3) { writer.write(loopConcept); writer.newLine(); foundConcepts.add(loopConcept); count3++; } } writer.close(); System.out.println("Found " + foundConcepts.size() + " concepts"); int countCrossover1 = 0; for (String loopConcept : foundConcepts) { if (crossovers1.contains(loopConcept)) { countCrossover1++; } } System.out.println(countCrossover1 + " are present in crossover_role_to_group"); int countCrossover2 = 0; for (String loopConcept : foundConcepts) { if (crossovers2.contains(loopConcept)) { countCrossover2++; } } System.out.println(countCrossover2 + " are present in crossover_group_to_group"); System.out.println("Done"); }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step4BoilerPlateRemoval.java
public static void main(String[] args) throws IOException { // input dir - list of xml query containers // step3-filled-raw-html File inputDir = new File(args[0]); // output dir File outputDir = new File(args[1]); if (!outputDir.exists()) { outputDir.mkdirs();//from w w w . j a va 2 s . c o m } // keep original html? (true == default) boolean keepOriginalHTML = !(args.length > 2 && "false".equals(args[2])); System.out.println(keepOriginalHTML); BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval(); // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { // boilerplate removal // there are some empty (corrupted) documents in ClueWeb, namely 0308wb-83.warc.gz if (rankedResults.originalHtml != null) { rankedResults.plainText = boilerPlateRemoval.getMinimalHtml(rankedResults.originalHtml, null); } if (!keepOriginalHTML) { rankedResults.originalHtml = null; } } // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step5LinguisticPreprocessing.java
public static void main(String[] args) throws Exception { // input dir - list of xml query containers // step4-boiler-plate/ File inputDir = new File(args[0]); // output dir File outputDir = new File(args[1]); if (!outputDir.exists()) { outputDir.mkdirs();/*from ww w . j a va 2 s. co m*/ } // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { // System.out.println(rankedResults.plainText); if (rankedResults.plainText != null) { String[] lines = StringUtils.split(rankedResults.plainText, "\n"); // collecting all cleaned lines List<String> cleanLines = new ArrayList<>(lines.length); // collecting line tags List<String> lineTags = new ArrayList<>(lines.length); for (String line : lines) { // get the tag String tag = null; Matcher m = OPENING_TAG_PATTERN.matcher(line); if (m.find()) { tag = m.group(1); } if (tag == null) { throw new IllegalArgumentException("No html tag found for line:\n" + line); } // replace the tag at the beginning and the end String noTagText = line.replaceAll("^<\\S+>", "").replaceAll("</\\S+>$", ""); // do some html cleaning noTagText = noTagText.replaceAll(" ", " "); noTagText = noTagText.trim(); // add to the output if (!noTagText.isEmpty()) { cleanLines.add(noTagText); lineTags.add(tag); } } if (cleanLines.isEmpty()) { // the document is empty System.err.println("Document " + rankedResults.clueWebID + " in query " + queryResultContainer.qID + " is empty"); } else { // now join them back to paragraphs String text = StringUtils.join(cleanLines, "\n"); // create JCas JCas jCas = JCasFactory.createJCas(); jCas.setDocumentText(text); jCas.setDocumentLanguage("en"); // annotate WebParagraph SimplePipeline.runPipeline(jCas, AnalysisEngineFactory.createEngineDescription(WebParagraphAnnotator.class)); // fill the original tag information List<WebParagraph> webParagraphs = new ArrayList<>( JCasUtil.select(jCas, WebParagraph.class)); // they must be the same size as original ones if (webParagraphs.size() != lineTags.size()) { throw new IllegalStateException( "Different size of annotated paragraphs and original lines"); } for (int i = 0; i < webParagraphs.size(); i++) { WebParagraph p = webParagraphs.get(i); // get tag String tag = lineTags.get(i); p.setOriginalHtmlTag(tag); } SimplePipeline.runPipeline(jCas, AnalysisEngineFactory.createEngineDescription(StanfordSegmenter.class, // only on existing WebParagraph annotations StanfordSegmenter.PARAM_ZONE_TYPES, WebParagraph.class.getCanonicalName())); // now convert to XMI ByteArrayOutputStream byteOutputStream = new ByteArrayOutputStream(); XmiCasSerializer.serialize(jCas.getCas(), byteOutputStream); // encode to base64 String encoded = new BASE64Encoder().encode(byteOutputStream.toByteArray()); rankedResults.originalXmi = encoded; } } } // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step10RemoveEmptyDocuments.java
public static void main(String[] args) throws IOException { // input dir - list of xml query containers File inputDir = new File(args[0]); // output dir File outputDir = new File(args[1]); if (!outputDir.exists()) { outputDir.mkdirs();/*w w w .j a v a2 s.com*/ } boolean crop = args.length >= 3 && "crop".equals(args[2]); // first find the maximum of zero-sized documents int maxMissing = 7; /* // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); // first find the maximum of zero-sized documents in a query int missingInQuery = 0; for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { // boilerplate removal if (rankedResults.plainText == null || rankedResults.plainText.isEmpty()) { missingInQuery++; } } maxMissing = Math.max(missingInQuery, maxMissing); } */ System.out.println("Max zeroLengthDocuments in query: " + maxMissing); // max is 7 = we're cut-off at 93 // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); List<QueryResultContainer.SingleRankedResult> nonEmptyDocsList = new ArrayList<>(); for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { // collect non-empty documents if (rankedResults.plainText != null && !rankedResults.plainText.isEmpty()) { nonEmptyDocsList.add(rankedResults); } } System.out.println("Non-empty docs coune: " + nonEmptyDocsList.size()); if (crop) { // now cut at 93 nonEmptyDocsList = nonEmptyDocsList.subList(0, (100 - maxMissing)); System.out.println("After cropping: " + nonEmptyDocsList.size()); } System.out.println("After cleaning: " + nonEmptyDocsList.size()); queryResultContainer.rankedResults.clear(); queryResultContainer.rankedResults.addAll(nonEmptyDocsList); // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:com.unifil.agendapaf.exemplos.word.DocxToXhtmlAndBack.java
public static void main(String[] args) throws Exception { // String baseURL = "file:///C:/Users/jharrop/git/docx4j-ImportXHTML/images"; Docx4jProperties.setProperty("docx4j.Convert.Out.HTML.OutputMethodXML", true); try {/* w ww .ja v a2 s . c o m*/ getInputFilePath(args); } catch (IllegalArgumentException e) { } System.out.println(inputfilepath); WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(new java.io.File(dir + inputfilepath)); // XHTML export AbstractHtmlExporter exporter = new HtmlExporterNG2(); HtmlSettings htmlSettings = new HtmlSettings(); htmlSettings.setWmlPackage(wordMLPackage); htmlSettings.setImageDirPath(dir + inputfilepath + "_files"); htmlSettings.setImageTargetUri(dir + inputfilepath + "_files"); String htmlFilePath = dir + "/DocxToXhtmlAndBack.html"; OutputStream os = new java.io.FileOutputStream(htmlFilePath); // javax.xml.transform.stream.StreamResult result = new javax.xml.transform.stream.StreamResult(os); // exporter.html(wordMLPackage, result, htmlSettings); // os.flush(); // os.close(); Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_NONE); // XHTML to docx String stringFromFile = FileUtils.readFileToString(new File(htmlFilePath), "UTF-8"); WordprocessingMLPackage docxOut = WordprocessingMLPackage.createPackage(); NumberingDefinitionsPart ndp = new NumberingDefinitionsPart(); docxOut.getMainDocumentPart().addTargetPart(ndp); ndp.unmarshalDefaultNumbering(); XHTMLImporterImpl XHTMLImporter = new XHTMLImporterImpl(docxOut); XHTMLImporter.setHyperlinkStyle("Hyperlink"); System.out.println("stringFromFile " + stringFromFile); docxOut.getMainDocumentPart().getContent().addAll(XHTMLImporter.convert(stringFromFile, null)); docxOut.save(new java.io.File(dir + "/DocxToXhtmlAndBack.docx")); }
From source file:edu.lternet.pasta.common.HTMLUtility.java
public static void main(String[] args) { File badIn = new File("/Users/servilla/tmp/bad.txt"); File goodOut = new File("/Users/servilla/tmp/good.txt"); String bad = null;/* w ww .j ava2 s .c o m*/ try { bad = FileUtils.readFileToString(badIn, "UTF-8"); } catch (IOException e) { System.err.println("HTMLUtility: " + e.getMessage()); e.printStackTrace(); } String good = stripNonValidHTMLCharacters(bad); try { FileUtils.writeStringToFile(goodOut, good, "UTF-8"); } catch (IOException e) { System.err.println("HTMLUtility: " + e.getMessage()); e.printStackTrace(); } }