Example usage for org.apache.commons.io FileUtils readFileToString

List of usage examples for org.apache.commons.io FileUtils readFileToString

Introduction

On this page you can find example usages of org.apache.commons.io FileUtils readFileToString.

Prototype

public static String readFileToString(File file, String encoding) throws IOException 

Source Link

Document

Reads the contents of a file into a String.

Usage

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step2FillWithRetrievedResults.java

/**
 * Merges externally retrieved ranking results into the XML query containers.
 *
 * <p>Every {@code *.xml} file directly inside the input directory is parsed into a
 * {@code QueryResultContainer} and indexed by its {@code qID}. The results file is then read
 * line by line; each line is split on whitespace and the columns are used as follows:
 * column 0 = query id, column 2 = ClueWeb id, column 3 = rank, column 4 = score,
 * column 5 = additional info (column 1 is not used). Lines whose query id has no matching
 * container are skipped. Finally every container is written to the output directory as
 * {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory with XML query containers,
 *             {@code args[1]} results file (e.g. ltr-50queries-100docs.txt),
 *             {@code args[2]} output directory (created if missing)
 * @throws IOException if any file cannot be read or written
 */
public static void main(String[] args) throws IOException {
    // input dir - list of xml query containers
    File inputDir = new File(args[0]);

    // retrieved results from Technion
    // ltr-50queries-100docs.txt
    File ltr = new File(args[1]);

    // output dir
    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    // load the query containers first (into map: id + container)
    Map<String, QueryResultContainer> queryResults = new HashMap<>();
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        System.out.println(f);
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));
        queryResults.put(queryResultContainer.qID, queryResultContainer);
    }

    // iterate over IR results; the charset is given explicitly so the file is decoded the
    // same way on every platform (all other reads/writes here use utf-8 as well)
    for (String line : FileUtils.readLines(ltr, "utf-8")) {
        String[] split = line.split("\\s+");
        // parsing to Integer and back normalizes the id (e.g. strips leading zeros)
        Integer origQueryId = Integer.valueOf(split[0]);
        String clueWebID = split[2];
        Integer rank = Integer.valueOf(split[3]);
        double score = Double.parseDouble(split[4]);
        String additionalInfo = split[5];

        // get the container for this result
        QueryResultContainer container = queryResults.get(origQueryId.toString());

        if (container != null) {
            // add new result
            QueryResultContainer.SingleRankedResult result = new QueryResultContainer.SingleRankedResult();
            result.clueWebID = clueWebID;
            result.rank = rank;
            result.score = score;
            result.additionalInfo = additionalInfo;

            // the result list is created lazily on the first hit for a query
            if (container.rankedResults == null) {
                container.rankedResults = new ArrayList<>();
            }
            container.rankedResults.add(result);
        }
    }

    // save all containers to the output dir
    for (QueryResultContainer queryResultContainer : queryResults.values()) {
        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }
}

From source file:de.tudarmstadt.ukp.csniper.resbuild.stuff.FilterPipe.java

/**
 * Walks all {@code *.csv} files below {@code base} (recursively) and writes a list of
 * corresponding {@code .xml} file names to the exclusions file.
 *
 * <p>For each CSV file the name with its last extension replaced by {@code .xml} is added to
 * the list, and removed again when the file content matches the character check below.
 * Progress is reported every 100 files.
 *
 * @param args not used
 * @throws IOException if a file cannot be read or the exclusions file cannot be written
 */
public static void main(String[] args) throws IOException {
    List<String> exclusions = new ArrayList<String>();
    int processed = 0;
    for (File csv : FileUtils.listFiles(new File(base), new String[] { "csv" }, true)) {
        String content = FileUtils.readFileToString(csv, "UTF-8");
        String xmlName = StringUtils.substringBeforeLast(csv.getName(), ".") + ".xml";

        exclusions.add(xmlName);
        // NOTE(review): the search-characters literal is empty here - possibly characters
        // that were lost somewhere along the way; confirm what was meant to be matched.
        if (StringUtils.containsAny(content, "")) {
            exclusions.remove(xmlName);
        }

        processed++;
        if (processed % 100 == 0) {
            System.out.println("ok:" + processed);
        }
    }

    File target = new File("D:\\hadoop\\output\\BNC_new\\exclusions.txt");
    FileUtils.writeLines(target, "UTF-8", exclusions);
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step3AddRawDocumentsFromClueWeb.java

/**
 * Fills the raw HTML of every ranked result from a WARC export.
 *
 * <p>For each {@code *.xml} query container in the input directory the WARC file is read from
 * start to end; whenever a record's {@code WARC-TREC-ID} header equals the ClueWeb id of a
 * ranked result, the record content (with HTTP headers removed, trimmed and XML-sanitized) is
 * stored as that result's {@code originalHtml}. Results that remain without HTML are reported
 * on stderr and marked as not relevant. The container is then written to the output directory
 * as {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory with XML query containers
 *             (step2a-retrieved-results), {@code args[1]} WARC file with the required
 *             documents, {@code args[2]} output directory (created if missing)
 * @throws IOException if reading the containers or the WARC file, or writing the output, fails
 */
public static void main(String[] args) throws IOException {
    // input dir - list of xml query containers
    // step2a-retrieved-results
    File inputDir = new File(args[0]);

    // warc.gz file containing all required documents according to ClueWeb IDs
    // ltr-50queries-100docs-clueweb-export.warc.gz
    File warc = new File(args[1]);

    // output dir
    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    // iterate over query containers
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));

        // iterate over warc for each query
        WARCFileReader reader = new WARCFileReader(new Configuration(), new Path(warc.getAbsolutePath()));
        try {
            // the loop is left only via EOFException once the reader is exhausted
            while (true) {
                WARCRecord read = reader.read();
                String trecId = read.getHeader().getField("WARC-TREC-ID");

                // now iterate over retrieved results for the query and find matching IDs
                for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
                    if (rankedResults.clueWebID.equals(trecId)) {
                        // add the raw html content
                        String fullHTTPResponse = new String(read.getContent(), "utf-8");
                        // TODO fix coding?

                        String html = removeHTTPHeaders(fullHTTPResponse);

                        rankedResults.originalHtml = sanitizeXmlChars(html.trim());
                    }
                }
            }
        } catch (EOFException e) {
            // end of file
        } finally {
            // a new reader is opened for every query container, so release it each time
            reader.close();
        }

        // check if all results have filled html
        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            if (rankedResults.originalHtml == null) {
                System.err.println("Missing original html for\t" + rankedResults.clueWebID
                        + ", setting relevance to false");
                rankedResults.relevant = Boolean.FALSE.toString();
            }
        }

        // and save the query to output dir
        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }

}

From source file:com.sonicle.webtop.mail.TestRegexReplace.java

/**
 * Scratch test that rewrites attachment-preview links in a saved message body to
 * {@code cid:} references.
 *
 * <p>Two link forms are handled, each followed by a 36-character upload id and a
 * {@code cid} parameter: one with HTML-escaped ampersands ({@code &amp;amp;}) and one with
 * plain ampersands. The second replacement runs on the output of the first so that both
 * forms end up replaced in the same string.
 *
 * <p>The final string is only computed, not printed or written anywhere.
 *
 * @param args not used
 * @throws Exception if the input file cannot be read
 */
public static void main(String args[]) throws Exception {
    String content = FileUtils.readFileToString(new File("/export/home/gbulfon/content2.txt"), "UTF-8");

    // pass 1: links whose ampersands are HTML-escaped
    String regex1 = RegexUtils.escapeRegexSpecialChars(
            "service-request?service=com.sonicle.webtop.mail&amp;csrf=iakyng66lcbs277m&amp;action=PreviewAttachment&amp;nowriter=true&amp;uploadId=");
    String regex2 = RegexUtils.escapeRegexSpecialChars("&amp;cid=");
    String replaced = StringUtils.replacePattern(content, regex1 + ".{36}" + regex2, "cid:");

    // pass 2: links with plain ampersands; must start from 'replaced', otherwise the
    // result of pass 1 is thrown away
    regex1 = RegexUtils.escapeRegexSpecialChars(
            "service-request?service=com.sonicle.webtop.mail&csrf=iakyng66lcbs277m&action=PreviewAttachment&nowriter=true&uploadId=");
    regex2 = RegexUtils.escapeRegexSpecialChars("&cid=");
    replaced = StringUtils.replacePattern(replaced, regex1 + ".{36}" + regex2, "cid:");
}

From source file:com.termmed.sampling.ConceptsWithMoreThanThreeRoleGroups.java

/**
 * The main method./*from  ww  w  . j a  v a  2s  .c om*/
 *
 * @param args the arguments
 * @throws Exception the exception
 */
public static void main(String[] args) throws Exception {
    System.out.println("Starting...");
    Map<String, Set<String>> groupsMap = new HashMap<String, Set<String>>();
    File relsFile = new File(
            "/Users/alo/Downloads/SnomedCT_RF2Release_INT_20160131-1/Snapshot/Terminology/sct2_Relationship_Snapshot_INT_20160131.txt");
    BufferedReader br2 = new BufferedReader(new FileReader(relsFile));
    String line2;
    int count2 = 0;
    while ((line2 = br2.readLine()) != null) {
        // process the line.
        count2++;
        if (count2 % 10000 == 0) {
            //System.out.println(count2);
        }
        List<String> columns = Arrays.asList(line2.split("\t", -1));
        if (columns.size() >= 6) {
            if (columns.get(2).equals("1") && !columns.get(6).equals("0")) {
                if (!groupsMap.containsKey(columns.get(4))) {
                    groupsMap.put(columns.get(4), new HashSet<String>());
                }
                groupsMap.get(columns.get(4)).add(columns.get(6));
            }
        }
    }
    System.out.println("Relationship groups loaded");
    Gson gson = new Gson();
    System.out.println("Reading JSON 1");
    File crossoverFile1 = new File("/Users/alo/Downloads/crossover_role_to_group.json");
    String contents = FileUtils.readFileToString(crossoverFile1, "utf-8");
    Type collectionType = new TypeToken<Collection<ControlResultLine>>() {
    }.getType();
    List<ControlResultLine> lineObject = gson.fromJson(contents, collectionType);
    Set<String> crossovers1 = new HashSet<String>();
    for (ControlResultLine loopResult : lineObject) {
        crossovers1.add(loopResult.conceptId);
    }
    System.out.println("Crossovers 1 loaded, " + lineObject.size() + " Objects");

    System.out.println("Reading JSON 2");
    File crossoverFile2 = new File("/Users/alo/Downloads/crossover_group_to_group.json");
    String contents2 = FileUtils.readFileToString(crossoverFile2, "utf-8");
    List<ControlResultLine> lineObject2 = gson.fromJson(contents2, collectionType);
    Set<String> crossovers2 = new HashSet<String>();
    for (ControlResultLine loopResult : lineObject2) {
        crossovers2.add(loopResult.conceptId);
    }
    System.out.println("Crossovers 2 loaded, " + lineObject2.size() + " Objects");

    Set<String> foundConcepts = new HashSet<String>();
    int count3 = 0;
    BufferedWriter writer = new BufferedWriter(
            new FileWriter(new File("ConceptsWithMoreThanThreeRoleGroups.csv")));
    ;
    for (String loopConcept : groupsMap.keySet()) {
        if (groupsMap.get(loopConcept).size() > 3) {
            writer.write(loopConcept);
            writer.newLine();
            foundConcepts.add(loopConcept);
            count3++;
        }
    }
    writer.close();
    System.out.println("Found " + foundConcepts.size() + " concepts");

    int countCrossover1 = 0;
    for (String loopConcept : foundConcepts) {
        if (crossovers1.contains(loopConcept)) {
            countCrossover1++;
        }
    }
    System.out.println(countCrossover1 + " are present in crossover_role_to_group");

    int countCrossover2 = 0;
    for (String loopConcept : foundConcepts) {
        if (crossovers2.contains(loopConcept)) {
            countCrossover2++;
        }
    }
    System.out.println(countCrossover2 + " are present in crossover_group_to_group");

    System.out.println("Done");
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step4BoilerPlateRemoval.java

/**
 * Runs boilerplate removal over the raw HTML of every ranked result.
 *
 * <p>Each {@code *.xml} query container in the input directory is loaded; for every result
 * that has {@code originalHtml}, the minimal HTML produced by the boilerplate remover is
 * stored in {@code plainText}. Unless the third argument is the literal {@code "false"},
 * the original HTML is kept; otherwise it is cleared. The container is then written to the
 * output directory as {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory (step3-filled-raw-html), {@code args[1]}
 *             output directory (created if missing), optional {@code args[2]}
 *             {@code "false"} to drop the original HTML
 * @throws IOException if a container cannot be read or written
 */
public static void main(String[] args) throws IOException {
    File sourceDir = new File(args[0]);

    File targetDir = new File(args[1]);
    if (!targetDir.exists()) {
        targetDir.mkdirs();
    }

    // the original HTML is kept by default; only an explicit "false" switches it off
    boolean keepOriginalHTML = args.length <= 2 || !"false".equals(args[2]);
    System.out.println(keepOriginalHTML);

    BoilerPlateRemoval remover = new JusTextBoilerplateRemoval();

    for (File containerFile : FileUtils.listFiles(sourceDir, new String[] { "xml" }, false)) {
        String xml = FileUtils.readFileToString(containerFile, "utf-8");
        QueryResultContainer container = QueryResultContainer.fromXML(xml);

        for (QueryResultContainer.SingleRankedResult result : container.rankedResults) {
            // there are some empty (corrupted) documents in ClueWeb, namely
            // 0308wb-83.warc.gz, so the html may be missing
            if (result.originalHtml != null) {
                result.plainText = remover.getMinimalHtml(result.originalHtml, null);
            }

            if (!keepOriginalHTML) {
                result.originalHtml = null;
            }
        }

        // store the processed container next to the others in the output dir
        File target = new File(targetDir, container.qID + ".xml");
        FileUtils.writeStringToFile(target, container.toXML(), "utf-8");
        System.out.println("Finished " + target);
    }
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step5LinguisticPreprocessing.java

/**
 * Runs linguistic preprocessing over the boilerplate-free text of every ranked result.
 *
 * <p>For each {@code *.xml} query container in the input directory and each result with
 * non-null {@code plainText}:
 * <ol>
 *   <li>the text is split into lines; each line must start with a tag recognized by
 *       {@code OPENING_TAG_PATTERN}, otherwise an {@link IllegalArgumentException} is
 *       thrown;</li>
 *   <li>the leading/trailing tag is stripped, {@code &nbsp;} is replaced by a space and the
 *       line is trimmed; empty lines are dropped;</li>
 *   <li>the remaining lines are joined, annotated with {@code WebParagraphAnnotator}, each
 *       paragraph is given its original tag, and {@code StanfordSegmenter} is run on the
 *       paragraph zones;</li>
 *   <li>the CAS is serialized to XMI, Base64-encoded and stored in {@code originalXmi}.</li>
 * </ol>
 * The container is then written to the output directory as {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory (step4-boiler-plate), {@code args[1]} output
 *             directory (created if missing)
 * @throws Exception if reading, annotation, serialization or writing fails
 */
public static void main(String[] args) throws Exception {
    File sourceDir = new File(args[0]);

    File targetDir = new File(args[1]);
    if (!targetDir.exists()) {
        targetDir.mkdirs();
    }

    for (File containerFile : FileUtils.listFiles(sourceDir, new String[] { "xml" }, false)) {
        String xml = FileUtils.readFileToString(containerFile, "utf-8");
        QueryResultContainer container = QueryResultContainer.fromXML(xml);

        for (QueryResultContainer.SingleRankedResult result : container.rankedResults) {
            // results without extracted text are left untouched
            if (result.plainText == null) {
                continue;
            }

            String[] rawLines = StringUtils.split(result.plainText, "\n");

            // cleaned paragraph texts and, at the same index, the tag each one came from
            List<String> paragraphs = new ArrayList<>(rawLines.length);
            List<String> paragraphTags = new ArrayList<>(rawLines.length);

            for (String rawLine : rawLines) {
                Matcher tagMatcher = OPENING_TAG_PATTERN.matcher(rawLine);
                String tagName = tagMatcher.find() ? tagMatcher.group(1) : null;
                if (tagName == null) {
                    throw new IllegalArgumentException("No html tag found for line:\n" + rawLine);
                }

                // strip the tag at the beginning and the end, then do some html cleaning
                String stripped = rawLine.replaceAll("^<\\S+>", "").replaceAll("</\\S+>$", "")
                        .replaceAll("&nbsp;", " ").trim();

                if (!stripped.isEmpty()) {
                    paragraphs.add(stripped);
                    paragraphTags.add(tagName);
                }
            }

            if (paragraphs.isEmpty()) {
                // the document is empty
                System.err.println("Document " + result.clueWebID + " in query "
                        + container.qID + " is empty");
                continue;
            }

            // one paragraph per line
            JCas cas = JCasFactory.createJCas();
            cas.setDocumentText(StringUtils.join(paragraphs, "\n"));
            cas.setDocumentLanguage("en");

            // annotate WebParagraph
            SimplePipeline.runPipeline(cas,
                    AnalysisEngineFactory.createEngineDescription(WebParagraphAnnotator.class));

            List<WebParagraph> annotated = new ArrayList<>(JCasUtil.select(cas, WebParagraph.class));

            // annotations and source lines must line up one-to-one to carry the tags over
            if (annotated.size() != paragraphTags.size()) {
                throw new IllegalStateException(
                        "Different size of annotated paragraphs and original lines");
            }

            for (int idx = 0; idx < annotated.size(); idx++) {
                annotated.get(idx).setOriginalHtmlTag(paragraphTags.get(idx));
            }

            SimplePipeline.runPipeline(cas,
                    AnalysisEngineFactory.createEngineDescription(StanfordSegmenter.class,
                            // only on existing WebParagraph annotations
                            StanfordSegmenter.PARAM_ZONE_TYPES, WebParagraph.class.getCanonicalName()));

            // serialize to XMI and keep it base64-encoded inside the container
            ByteArrayOutputStream xmiBytes = new ByteArrayOutputStream();
            XmiCasSerializer.serialize(cas.getCas(), xmiBytes);
            result.originalXmi = new BASE64Encoder().encode(xmiBytes.toByteArray());
        }

        // and save the query to output dir
        File target = new File(targetDir, container.qID + ".xml");
        FileUtils.writeStringToFile(target, container.toXML(), "utf-8");
        System.out.println("Finished " + target);
    }

}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step10RemoveEmptyDocuments.java

/**
 * Removes ranked results without text from every query container and optionally crops the
 * remaining list to a common length.
 *
 * <p>For each {@code *.xml} query container in the input directory, only results whose
 * {@code plainText} is non-null and non-empty are kept. When the third argument is
 * {@code "crop"}, the list is additionally cut to at most {@code 100 - maxMissing} entries.
 * The container is then written to the output directory as {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory with XML query containers, {@code args[1]}
 *             output directory (created if missing), optional {@code args[2]} {@code "crop"}
 * @throws IOException if a container cannot be read or written
 */
public static void main(String[] args) throws IOException {
    // input dir - list of xml query containers
    File inputDir = new File(args[0]);

    // output dir
    File outputDir = new File(args[1]);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    boolean crop = args.length >= 3 && "crop".equals(args[2]);

    // Maximum number of zero-sized documents in a single query. This used to be computed by
    // a pass over all containers (counting results with null/empty plainText per query and
    // taking the maximum); that pass is disabled and the value is hard-coded.
    int maxMissing = 7;

    System.out.println("Max zeroLengthDocuments in query: " + maxMissing);
    // max is 7 = we're cut-off at 93

    // iterate over query containers
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));

        List<QueryResultContainer.SingleRankedResult> nonEmptyDocsList = new ArrayList<>();

        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            // collect non-empty documents
            if (rankedResults.plainText != null && !rankedResults.plainText.isEmpty()) {
                nonEmptyDocsList.add(rankedResults);
            }
        }

        System.out.println("Non-empty docs coune: " + nonEmptyDocsList.size());

        if (crop) {
            // now cut at 93; clamp to the actual size so a query with fewer non-empty
            // documents does not make subList throw IndexOutOfBoundsException
            int limit = Math.min(nonEmptyDocsList.size(), 100 - maxMissing);
            nonEmptyDocsList = nonEmptyDocsList.subList(0, limit);
            System.out.println("After cropping: " + nonEmptyDocsList.size());
        }
        System.out.println("After cleaning: " + nonEmptyDocsList.size());

        // nonEmptyDocsList is a separate list, so clearing the container's list is safe
        queryResultContainer.rankedResults.clear();
        queryResultContainer.rankedResults.addAll(nonEmptyDocsList);

        // and save the query to output dir
        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }

}

From source file:com.unifil.agendapaf.exemplos.word.DocxToXhtmlAndBack.java

/**
 * Round-trips a docx document through XHTML: exports the input document to
 * {@code DocxToXhtmlAndBack.html}, reads that file back and imports it into a new docx
 * package saved as {@code DocxToXhtmlAndBack.docx}. Both files are created under {@code dir}.
 *
 * @param args passed to {@code getInputFilePath}
 * @throws Exception if loading, exporting, importing or saving fails
 */
public static void main(String[] args) throws Exception {

    Docx4jProperties.setProperty("docx4j.Convert.Out.HTML.OutputMethodXML", true);

    try {
        getInputFilePath(args);
    } catch (IllegalArgumentException ignored) {
        // Deliberately ignored so that processing continues with the current inputfilepath.
        // NOTE(review): presumably thrown when no path argument is supplied - confirm.
    }

    System.out.println(inputfilepath);
    WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(new java.io.File(dir + inputfilepath));

    // XHTML export
    // NOTE(review): 'exporter' is never used below; kept only in case its construction
    // matters - confirm and remove.
    AbstractHtmlExporter exporter = new HtmlExporterNG2();
    HtmlSettings htmlSettings = new HtmlSettings();

    htmlSettings.setWmlPackage(wordMLPackage);

    htmlSettings.setImageDirPath(dir + inputfilepath + "_files");
    htmlSettings.setImageTargetUri(dir + inputfilepath + "_files");

    String htmlFilePath = dir + "/DocxToXhtmlAndBack.html";
    OutputStream os = new java.io.FileOutputStream(htmlFilePath);
    try {
        Docx4J.toHTML(htmlSettings, os, Docx4J.FLAG_NONE);
    } finally {
        // the file is read back right below, so it must be closed first
        os.close();
    }

    // XHTML to docx
    String stringFromFile = FileUtils.readFileToString(new File(htmlFilePath), "UTF-8");

    WordprocessingMLPackage docxOut = WordprocessingMLPackage.createPackage();
    NumberingDefinitionsPart ndp = new NumberingDefinitionsPart();
    docxOut.getMainDocumentPart().addTargetPart(ndp);
    ndp.unmarshalDefaultNumbering();

    XHTMLImporterImpl XHTMLImporter = new XHTMLImporterImpl(docxOut);
    XHTMLImporter.setHyperlinkStyle("Hyperlink");

    System.out.println("stringFromFile " + stringFromFile);
    docxOut.getMainDocumentPart().getContent().addAll(XHTMLImporter.convert(stringFromFile, null));

    docxOut.save(new java.io.File(dir + "/DocxToXhtmlAndBack.docx"));

}

From source file:edu.lternet.pasta.common.HTMLUtility.java

/**
 * Reads {@code bad.txt}, passes its content through {@code stripNonValidHTMLCharacters} and
 * writes the result to {@code good.txt}. I/O errors are reported on stderr; if the input
 * cannot be read, nothing is written.
 *
 * @param args not used
 */
public static void main(String[] args) {

    File badIn = new File("/Users/servilla/tmp/bad.txt");
    File goodOut = new File("/Users/servilla/tmp/good.txt");

    String bad;

    try {
        bad = FileUtils.readFileToString(badIn, "UTF-8");
    } catch (IOException e) {
        System.err.println("HTMLUtility: " + e.getMessage());
        e.printStackTrace();
        // without input there is nothing to clean; do not go on with a null string
        return;
    }

    String good = stripNonValidHTMLCharacters(bad);

    try {
        FileUtils.writeStringToFile(goodOut, good, "UTF-8");
    } catch (IOException e) {
        System.err.println("HTMLUtility: " + e.getMessage());
        e.printStackTrace();
    }

}