Example usage for org.apache.commons.io FileUtils listFiles

Introduction

This page shows example usage of the org.apache.commons.io FileUtils.listFiles method.

Prototype

public static Collection<File> listFiles(File directory, String[] extensions, boolean recursive)

Document

Finds files within a given directory (and optionally its subdirectories) which match an array of extensions.
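
The call below is a minimal, self-contained sketch of typical usage (the directory path and extension list are placeholders, not taken from any of the examples that follow): it recursively collects files with the given extensions and prints them.

import java.io.File;
import java.util.Collection;

import org.apache.commons.io.FileUtils;

public class ListFilesExample {
    public static void main(String[] args) {
        // Placeholder directory; passing null instead of the extensions array would match every file.
        File dir = new File("src");
        Collection<File> files = FileUtils.listFiles(dir, new String[] { "java", "xml" }, true);
        for (File file : files) {
            System.out.println(file.length() + " bytes: " + file.getPath());
        }
    }
}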

Usage

From source file:eu.annocultor.analyzers.SolrPropertyHitsAnalyzer.java

/**
 * @param args
 */
public static void main(String[] args) throws Exception {

    String solrUrl = args[0];
    SolrServer solr = new CommonsHttpSolrServer(solrUrl);

    String prefixOne = args[1];
    String prefixTwo = args[2];

    long prefixOneCount = 0;
    long prefixTwoCount = 0;

    long totalPassedCount = 0;

    // null extensions = no extension filter (all files); true = recurse into subdirectories
    for (File logLocation : FileUtils.listFiles(new File(args[3]), null, true)) {
        System.out.println("Parsing " + logLocation);

        for (String line : FileUtils.readLines(logLocation)) {
            if (StringUtils.contains(line, "FULL_RESULT_HMTL")) {
                line = StringUtils.substringAfter(line, "europeana_uri=");
                String solrDocumentId = StringUtils.substringBefore(line, ",");
                String query = extractQuery(line);
                if (StringUtils.startsWith(solrDocumentId, "http://") && isLongEnoughToCount(query)) {

                    SolrQuery solrQuery = new SolrQuery("europeana_uri:\"" + solrDocumentId + "\"");
                    QueryResponse response = solr.query(solrQuery);
                    SolrDocumentList sourceDocs = response.getResults();
                    if (sourceDocs.isEmpty()) {
                        System.out.println("Could not find object " + solrDocumentId);
                    } else {
                        SolrDocument document = sourceDocs.get(0);

                        if (hasWord(document, prefixOne, query)) {
                            prefixOneCount++;
                        } else {
                            if (hasWord(document, prefixTwo, query)) {
                                prefixTwoCount++;
                            }
                        }
                    }
                }
                totalPassedCount++;
            }
        }
        System.out.println(prefixOne + " : " + prefixOneCount + " " + prefixTwo + " : " + prefixTwoCount
                + " of total passed entries " + totalPassedCount);
    }
}

From source file:de.tudarmstadt.ukp.csniper.resbuild.stuff.FilterPipe.java

public static void main(String[] args) throws IOException {
    List<String> files = new ArrayList<String>();
    int i = 0;
    // recursively collect every .csv file under the base directory
    for (File file : FileUtils.listFiles(new File(base), new String[] { "csv" }, true)) {
        String text = FileUtils.readFileToString(file, "UTF-8");
        files.add(StringUtils.substringBeforeLast(file.getName(), ".") + ".xml");
        if (StringUtils.containsAny(text, "")) {
            files.remove(StringUtils.substringBeforeLast(file.getName(), ".") + ".xml");
        }
        i++;
        if (i % 100 == 0) {
            System.out.println("ok:" + i);
        }
    }

    FileUtils.writeLines(new File("D:\\hadoop\\output\\BNC_new\\exclusions.txt"), "UTF-8", files);
}

From source file:com.doculibre.constellio.utils.resources.WriteResourceBundleUtils.java

@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
    File binDir = ClasspathUtils.getClassesDir();
    File projectDir = binDir.getParentFile();
    File sourceDir = new File(projectDir, "source");

    String defaultLanguage;
    String otherLanguage;
    if (args.length > 0) {
        defaultLanguage = args[0];
        otherLanguage = args[1];
    } else {
        defaultLanguage = Locale.ENGLISH.getLanguage();
        otherLanguage = Locale.FRENCH.getLanguage();
    }

    // note: the cast assumes the Collection returned by listFiles is backed by a List
    List<File> propertiesFiles = (List<File>) FileUtils.listFiles(sourceDir, new String[] { "properties" },
            true);
    for (File propertiesFile : propertiesFiles) {
        File propertiesDir = propertiesFile.getParentFile();

        String propertiesNameWoutSuffix = StringUtils.substringBefore(propertiesFile.getName(), "_");
        propertiesNameWoutSuffix = StringUtils.substringBefore(propertiesNameWoutSuffix, ".properties");

        String noLanguageFileName = propertiesNameWoutSuffix + ".properties";
        String defaultLanguageFileName = propertiesNameWoutSuffix + "_" + defaultLanguage + ".properties";
        String otherLanguageFileName = propertiesNameWoutSuffix + "_" + otherLanguage + ".properties";

        File noLanguageFile = new File(propertiesDir, noLanguageFileName);
        File defaultLanguageFile = new File(propertiesDir, defaultLanguageFileName);
        File otherLanguageFile = new File(propertiesDir, otherLanguageFileName);

        if (defaultLanguageFile.exists() && otherLanguageFile.exists() && !noLanguageFile.exists()) {
            System.out.println(defaultLanguageFile.getPath() + " > " + noLanguageFileName);
            System.out.println(defaultLanguageFile.getPath() + " > empty file");

            defaultLanguageFile.renameTo(noLanguageFile);
            FileWriter defaultLanguageEmptyFileWriter = new FileWriter(defaultLanguageFile);
            defaultLanguageEmptyFileWriter.write("");
            IOUtils.closeQuietly(defaultLanguageEmptyFileWriter);
        }
    }
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step3AddRawDocumentsFromClueWeb.java

public static void main(String[] args) throws IOException {
    // input dir - list of xml query containers
    // step2a-retrieved-results
    File inputDir = new File(args[0]);

    // warc.bz file containing all required documents according to ClueWeb IDs
    // ltr-50queries-100docs-clueweb-export.warc.gz
    File warc = new File(args[1]);

    // output dir
    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    // iterate over query containers
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));

        // iterate over warc for each query
        WARCFileReader reader = new WARCFileReader(new Configuration(), new Path(warc.getAbsolutePath()));
        try {
            while (true) {
                WARCRecord read = reader.read();
                String trecId = read.getHeader().getField("WARC-TREC-ID");

                // now iterate over retrieved results for the query and find matching IDs
                for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
                    if (rankedResults.clueWebID.equals(trecId)) {
                        // add the raw html content
                        String fullHTTPResponse = new String(read.getContent(), "utf-8");
                        // TODO fix coding?

                        String html = removeHTTPHeaders(fullHTTPResponse);

                        rankedResults.originalHtml = sanitizeXmlChars(html.trim());
                    }
                }
            }
        } catch (EOFException e) {
            // end of file
        }

        // check if all results have filled html
        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            if (rankedResults.originalHtml == null) {
                System.err.println("Missing original html for\t" + rankedResults.clueWebID
                        + ", setting relevance to false");
                rankedResults.relevant = Boolean.FALSE.toString();
            }
        }

        // and save the query to output dir
        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }

}

From source file:com.nuance.expertassistant.ContentCrawler.java

public static void main(String args[]) {
    if (args.length == 0) {
        ContentExtractor.startDocument("Test3", "/Users/abhishek_rohatgi/" + "Test3" + ".xml");
        ContentExtractor.extract("http://www.audihelp.com/auda-147-tyre_repairs.html");
        ContentExtractor.endDocument();
    } else {
        final ContentCrawlerOptions options = new ContentCrawlerOptions(args);
        final ContentCrawlerInputTypes inputType = ContentCrawlerOptions.getInputType();
        if (ContentCrawlerInputTypes.FILE.equals(inputType)) {
            final File outputFile = new File(getOutput());
            if (!outputFile.getParentFile().exists()) {
                outputFile.getParentFile().mkdirs();
            }
            translateFile(getInput(), getOutput());
        } else if (ContentCrawlerInputTypes.FOLDER.equals(inputType)) {
            final File outputFolder = new File(getOutput());
            final Collection<File> inputFiles = FileUtils.listFiles(new File(getInput()),
                    new RegexFileFilter("^(.*\\.(html)?)"), DirectoryFileFilter.DIRECTORY);
            for (final File inputFile : inputFiles) {
                final String outputFileName = inputFile.getAbsolutePath().substring(getInput().length())
                        + ".xml";
                final File outputFile = new File(outputFolder, outputFileName);
                if (!outputFile.getParentFile().exists()) {
                    outputFile.getParentFile().mkdirs();
                }
                translateFile(inputFile.getAbsolutePath(), outputFile.getAbsolutePath());
            }

        } else {
            ContentExtractor.startDocument(getInput(), getOutput());
            ContentExtractor.extract(getInput());
            ContentExtractor.endDocument();
        }
    }
}

From source file:com.makkajai.ObjCToCpp.java

/**
 * Main Method
 *
 * @param args - First argument is the input directory to scan and second is the output directory to write files to.
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    if (args.length < 2) {
        System.out.println("Invalid Arguments!");
        System.out.println(
                "Usage: java com.makkajai.ObjCToCpp \"<directory to scan for .h and .m files>\" \"<directory to write .h and .cpp files>\"");
        return;
    }

    String inputDirectory = args[0];
    String outputDirectory = args[1];
    //     String inputDirectory = "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/scenes";
    //     String outputDirectory = "/Users/administrator/playground/projarea/monster-math-cross-platform/monster-math-2/Classes/Makkajai/scenes";

    List<String> exceptFiles = new ArrayList<String>();

    if (args.length == 3) {
        BufferedReader bufferedInputStream = new BufferedReader(new FileReader(args[2]));
        String exceptFile = null;
        while ((exceptFile = bufferedInputStream.readLine()) != null) {
            if (exceptFile.equals(""))
                continue;
            exceptFiles.add(exceptFile);
        }
    }

    //Getting all the files from the input directory.
    final List<File> files = new ArrayList<File>(FileUtils.listFiles(new File(inputDirectory),
            new RegexFileFilter(FILE_NAME_WITH_H_OR_M), DirectoryFileFilter.DIRECTORY));

    //        String fileName =
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/Utils/MakkajaiEnum"
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/Utils/MakkajaiUtil"
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/Home"
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/Activities/gnumchmenu/PlayStrategy"
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/Characters/Character"
    //                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/Activities/gnumchmenu/GnumchScene"
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/ParentScene"
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/BaseSkillView"
    ////                "/Users/administrator/playground/projarea/math-monsters-2/makkajai-number-muncher/makkajai-ios/Makkajai/Makkajai/YDLayerBase"
    //                ;
    //The instance of the translator.
    ObjCToCppTranslator visitor = new ObjCToCppTranslator();

    for (int i = 0; i < files.size();) {
        File currentFile = files.get(i);
        String filePathRelativeToInput = currentFile.getAbsolutePath().replace(inputDirectory, "");
        Date startTime = new Date();
        try {
            final TranslateFileInput translateFileInput = new TranslateFileInput(inputDirectory,
                    outputDirectory, filePathRelativeToInput, false);
            if (nextFileIsM(currentFile, files, i)) {
                try {
                    if (isIgnoredFile(filePathRelativeToInput, exceptFiles))
                        continue;
                    translateFileInput.dryRun = true;
                    visitor.translateFile(translateFileInput);
                    Date stopTime = new Date();
                    System.out.println("Dry run File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime, stopTime));

                    Date startTime1 = new Date();
                    translateFileInput.filePathRelativeToInput = filePathRelativeToInput.replace(H, M);
                    translateFileInput.dryRun = false;
                    visitor.translateFile(translateFileInput);
                    stopTime = new Date();
                    System.out.println("Processed File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime1, stopTime));

                    Date startTime2 = new Date();
                    translateFileInput.filePathRelativeToInput = filePathRelativeToInput;
                    translateFileInput.dryRun = false;
                    visitor.translateFile(translateFileInput);
                    stopTime = new Date();
                    System.out.println("Processed File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime2, stopTime));
                } catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("###########################Error Processing: " + filePathRelativeToInput
                            + ", Continuing with next set of tiles");
                } finally {
                    i += 2;
                }
                continue;
            }
            if (!isIgnoredFile(filePathRelativeToInput, exceptFiles))
                visitor.translateFile(translateFileInput);
            i++;
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("###########################Error Processing: " + filePathRelativeToInput
                    + ", Continuing with next set of tiles");
        } finally {
            Date stopTime = new Date();
            //                System.out.println("Processed File(s): " + filePathRelativeToInput.replaceAll(H_OR_M, "") + " Time Taken: " + getDelta(startTime, stopTime));
        }
    }
}

From source file:com.github.xbn.examples.io.non_xbn.SizeOrderAllFilesInDirXmpl.java

public static final void main(String[] ignored) {
    File fDir = (new File("R:\\jeffy\\programming\\sandbox\\xbnjava\\xbn\\"));
    Collection<File> cllf = FileUtils.listFiles(fDir, (new String[] { "java" }), true);

    //Add all file paths to a Map, keyed by size.
    //It's actually a map of lists-of-files, to
    //allow multiple files that happen to have the
    //same length.

    TreeMap<Long, List<File>> tmFilesBySize = new TreeMap<Long, List<File>>();
    Iterator<File> itrf = cllf.iterator();
    while (itrf.hasNext()) {
        File f = itrf.next();
        Long LLen = f.length();
        if (!tmFilesBySize.containsKey(LLen)) {
            ArrayList<File> alf = new ArrayList<File>();
            alf.add(f);
            tmFilesBySize.put(LLen, alf);
        } else {
            tmFilesBySize.get(LLen).add(f);
        }
    }

    //Iterate backwards by key through the map. For each
    //List<File>, iterate through the files, printing out
    //its size and path.

    ArrayList<Long> alSize = new ArrayList<Long>(tmFilesBySize.keySet());
    for (int i = alSize.size() - 1; i >= 0; i--) {
        itrf = tmFilesBySize.get(alSize.get(i)).iterator();
        while (itrf.hasNext()) {
            File f = itrf.next();
            System.out.println(f.length() + ": " + f.getPath());
        }
    }
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step4BoilerPlateRemoval.java

public static void main(String[] args) throws IOException {
    // input dir - list of xml query containers
    // step3-filled-raw-html
    File inputDir = new File(args[0]);

    // output dir
    File outputDir = new File(args[1]);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    // keep original html? (true == default)
    boolean keepOriginalHTML = !(args.length > 2 && "false".equals(args[2]));

    System.out.println(keepOriginalHTML);

    BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();

    // iterate over query containers
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));

        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            // boilerplate removal

            // there are some empty (corrupted) documents in ClueWeb, namely 0308wb-83.warc.gz
            if (rankedResults.originalHtml != null) {

                rankedResults.plainText = boilerPlateRemoval.getMinimalHtml(rankedResults.originalHtml, null);
            }

            if (!keepOriginalHTML) {
                rankedResults.originalHtml = null;
            }
        }

        // and save the query to output dir
        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }

}

From source file:de.tudarmstadt.ukp.experiments.argumentation.convincingness.sampling.Step4MTurkOutputCollector.java

@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
    String inputDirWithArgumentPairs = args[0];

    File[] resultFiles;

    if (args[1].contains("*")) {
        File path = new File(args[1]);
        File directory = path.getParentFile();
        String regex = path.getName().replaceAll("\\*", "");

        List<File> files = new ArrayList<>(FileUtils.listFiles(directory, new String[] { regex }, false));
        resultFiles = new File[files.size()];
        for (int i = 0; i < files.size(); i++) {
            resultFiles[i] = files.get(i);
        }
    } else {
        // result file is a comma-separated list of CSV files from MTurk
        String[] split = args[1].split(",");
        resultFiles = new File[split.length];
        for (int i = 0; i < split.length; i++) {
            resultFiles[i] = new File(split[i]);
        }
    }

    File outputDir = new File(args[2]);

    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            throw new IOException("Cannot create directory " + outputDir);
        }
    }

    // error if output folder not empty to prevent any confusion by mixing files
    if (!FileUtils.listFiles(outputDir, null, false).isEmpty()) {
        throw new IllegalArgumentException("Output dir " + outputDir + " is not empty");
    }

    // collected assignments with empty reason for rejections
    Set<String> assignmentsWithEmptyReason = new HashSet<>();

    // parse with first line as header
    MTurkOutputReader mTurkOutputReader = new MTurkOutputReader(resultFiles);

    Collection<File> files = FileUtils.listFiles(new File(inputDirWithArgumentPairs), new String[] { "xml" },
            false);

    if (files.isEmpty()) {
        throw new IOException("No xml files found in " + inputDirWithArgumentPairs);
    }

    // statistics: how many hits with how many assignments ; hit ID / assignments
    Map<String, Map<String, Integer>> assignmentsPerHits = new HashMap<>();

    // collect accept/reject statistics
    for (Map<String, String> record : mTurkOutputReader) {
        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");
        String hitTypeId = record.get("hittypeid");

        if (!wasRejected) {
            // update statistics
            if (!assignmentsPerHits.containsKey(hitTypeId)) {
                assignmentsPerHits.put(hitTypeId, new HashMap<String, Integer>());
            }

            if (!assignmentsPerHits.get(hitTypeId).containsKey(hitID)) {
                assignmentsPerHits.get(hitTypeId).put(hitID, 0);
            }

            assignmentsPerHits.get(hitTypeId).put(hitID, assignmentsPerHits.get(hitTypeId).get(hitID) + 1);
        }
    }

    // statistics: how many hits with how many assignments ; hit ID / assignments
    Map<String, Integer> approvedAssignmentsPerHit = new HashMap<>();
    Map<String, Integer> rejectedAssignmentsPerHit = new HashMap<>();

    // collect accept/reject statistics
    for (Map<String, String> record : mTurkOutputReader) {
        boolean approved = "Approved".equals(record.get("assignmentstatus"));
        boolean rejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");

        if (approved) {
            // update statistics
            if (!approvedAssignmentsPerHit.containsKey(hitID)) {
                approvedAssignmentsPerHit.put(hitID, 0);
            }

            approvedAssignmentsPerHit.put(hitID, approvedAssignmentsPerHit.get(hitID) + 1);
        } else if (rejected) {
            // update statistics
            if (!rejectedAssignmentsPerHit.containsKey(hitID)) {
                rejectedAssignmentsPerHit.put(hitID, 0);
            }

            rejectedAssignmentsPerHit.put(hitID, rejectedAssignmentsPerHit.get(hitID) + 1);
        } else {
            throw new IllegalStateException(
                    "Unknown state: " + record.get("assignmentstatus") + " HITID: " + hitID);
        }
    }

    //        System.out.println("Approved: " + approvedAssignmentsPerHit);
    //        System.out.println("Rejected: " + rejectedAssignmentsPerHit);

    System.out.println("Approved (values): " + new HashSet<>(approvedAssignmentsPerHit.values()));
    System.out.println("Rejected (values): " + new HashSet<>(rejectedAssignmentsPerHit.values()));
    // rejection statistics
    int totalRejected = 0;
    for (Map.Entry<String, Integer> rejectionEntry : rejectedAssignmentsPerHit.entrySet()) {
        totalRejected += rejectionEntry.getValue();
    }

    System.out.println("Total rejections: " + totalRejected);

    /*
    // generate .success files for adding more annotations
    for (File resultFile : resultFiles) {
    String hitTypeID = mTurkOutputReader.getHitTypeIdForFile().get(resultFile);
            
    // assignments for that hittypeid (= file)
    Map<String, Integer> assignments = assignmentsPerHits.get(hitTypeID);
            
    prepareUpdateHITsFiles(assignments, hitTypeID, resultFile);
    }
    */

    int totalSavedPairs = 0;

    // load all previously prepared argument pairs
    for (File file : files) {
        List<ArgumentPair> argumentPairs = (List<ArgumentPair>) XStreamTools.getXStream().fromXML(file);

        List<AnnotatedArgumentPair> annotatedArgumentPairs = new ArrayList<>();

        for (ArgumentPair argumentPair : argumentPairs) {
            AnnotatedArgumentPair annotatedArgumentPair = new AnnotatedArgumentPair(argumentPair);

            // is there such an answer?
            String key = "Answer." + argumentPair.getId();

            // iterate only if there is such column to save time
            if (mTurkOutputReader.getColumnNames().contains(key)) {
                // now find the results
                for (Map<String, String> record : mTurkOutputReader) {
                    if (record.containsKey(key)) {
                        // extract the values
                        AnnotatedArgumentPair.MTurkAssignment assignment = new AnnotatedArgumentPair.MTurkAssignment();

                        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));

                        // only non-rejected (if required)
                        if (!wasRejected) {
                            String hitID = record.get("hitid");
                            String workerID = record.get("workerid");
                            String assignmentId = record.get("assignmentid");
                            try {
                                assignment.setAssignmentAcceptTime(
                                        DATE_FORMAT.parse(record.get("assignmentaccepttime")));
                                assignment.setAssignmentSubmitTime(
                                        DATE_FORMAT.parse(record.get("assignmentsubmittime")));
                                assignment.setHitComment(record.get("Answer.feedback"));
                                assignment.setHitID(hitID);
                                assignment.setTurkID(workerID);
                                assignment.setAssignmentId(assignmentId);

                                // and answer specific fields
                                String valueRaw = record.get(key);

                                // so far the label has had format aXXX_aYYY_a1, aXXX_aYYY_a2, or aXXX_aYYY_equal
                                // strip now only true label
                                String label = valueRaw.split("_")[2];

                                assignment.setValue(label);
                                String reason = record.get(key + "_reason");

                                // missing reason
                                if (reason == null) {
                                    assignmentsWithEmptyReason.add(assignmentId);
                                } else {
                                    assignment.setReason(reason);

                                    // get worker's stance
                                    String stanceRaw = record.get(key + "_stance");
                                    if (stanceRaw != null) {
                                        // parse stance
                                        String stance = stanceRaw.split("_stance_")[1];
                                        assignment.setWorkerStance(stance);
                                    }

                                    // we take maximal 5 assignments
                                    Collections.sort(annotatedArgumentPair.mTurkAssignments,
                                            new Comparator<AnnotatedArgumentPair.MTurkAssignment>() {
                                                @Override
                                                public int compare(AnnotatedArgumentPair.MTurkAssignment o1,
                                                        AnnotatedArgumentPair.MTurkAssignment o2) {
                                                    return o1.getAssignmentAcceptTime()
                                                            .compareTo(o2.getAssignmentAcceptTime());
                                                }
                                            });

                                    if (annotatedArgumentPair.mTurkAssignments
                                            .size() < MAXIMUM_ASSIGNMENTS_PER_HIT) {
                                        annotatedArgumentPair.mTurkAssignments.add(assignment);
                                    }
                                }
                            } catch (IllegalArgumentException | NullPointerException ex) {
                                System.err.println("Malformed annotations for HIT " + hitID + ", worker "
                                        + workerID + ", assignment " + assignmentId + "; " + ex.getMessage()
                                        + ", full record: " + record);
                            }
                        }
                    }
                }
            }

            // and if there are some annotations, add it to the result set
            if (!annotatedArgumentPair.mTurkAssignments.isEmpty()) {
                annotatedArgumentPairs.add(annotatedArgumentPair);
            }
        }

        if (!annotatedArgumentPairs.isEmpty()) {
            File outputFile = new File(outputDir, file.getName());
            XStreamTools.toXML(annotatedArgumentPairs, outputFile);

            System.out.println("Saved " + annotatedArgumentPairs.size() + " annotated pairs to " + outputFile);
            totalSavedPairs += annotatedArgumentPairs.size();
        }
    }

    System.out.println("Total saved " + totalSavedPairs + " pairs");

    // print assignments with empty reasons
    if (!assignmentsWithEmptyReason.isEmpty()) {
        System.out.println(
                "== Assignments with empty reason:\nassignmentIdToReject\tassignmentIdToRejectComment");
        for (String assignmentId : assignmentsWithEmptyReason) {
            System.out.println(
                    assignmentId + "\t\"Dear worker, you did not fill the required field with a reason.\"");
        }
    }

}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step5LinguisticPreprocessing.java

public static void main(String[] args) throws Exception {
    // input dir - list of xml query containers
    // step4-boiler-plate/
    File inputDir = new File(args[0]);

    // output dir
    File outputDir = new File(args[1]);
    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    // iterate over query containers
    for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        QueryResultContainer queryResultContainer = QueryResultContainer
                .fromXML(FileUtils.readFileToString(f, "utf-8"));

        for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) {
            //                System.out.println(rankedResults.plainText);

            if (rankedResults.plainText != null) {
                String[] lines = StringUtils.split(rankedResults.plainText, "\n");

                // collecting all cleaned lines
                List<String> cleanLines = new ArrayList<>(lines.length);
                // collecting line tags
                List<String> lineTags = new ArrayList<>(lines.length);

                for (String line : lines) {
                    // get the tag
                    String tag = null;
                    Matcher m = OPENING_TAG_PATTERN.matcher(line);

                    if (m.find()) {
                        tag = m.group(1);
                    }

                    if (tag == null) {
                        throw new IllegalArgumentException("No html tag found for line:\n" + line);
                    }

                    // replace the tag at the beginning and the end
                    String noTagText = line.replaceAll("^<\\S+>", "").replaceAll("</\\S+>$", "");

                    // do some html cleaning
                    noTagText = noTagText.replaceAll("&nbsp;", " ");

                    noTagText = noTagText.trim();

                    // add to the output
                    if (!noTagText.isEmpty()) {
                        cleanLines.add(noTagText);
                        lineTags.add(tag);
                    }
                }

                if (cleanLines.isEmpty()) {
                    // the document is empty
                    System.err.println("Document " + rankedResults.clueWebID + " in query "
                            + queryResultContainer.qID + " is empty");
                } else {
                    // now join them back to paragraphs
                    String text = StringUtils.join(cleanLines, "\n");

                    // create JCas
                    JCas jCas = JCasFactory.createJCas();
                    jCas.setDocumentText(text);
                    jCas.setDocumentLanguage("en");

                    // annotate WebParagraph
                    SimplePipeline.runPipeline(jCas,
                            AnalysisEngineFactory.createEngineDescription(WebParagraphAnnotator.class));

                    // fill the original tag information
                    List<WebParagraph> webParagraphs = new ArrayList<>(
                            JCasUtil.select(jCas, WebParagraph.class));

                    // they must be the same size as original ones
                    if (webParagraphs.size() != lineTags.size()) {
                        throw new IllegalStateException(
                                "Different size of annotated paragraphs and original lines");
                    }

                    for (int i = 0; i < webParagraphs.size(); i++) {
                        WebParagraph p = webParagraphs.get(i);
                        // get tag
                        String tag = lineTags.get(i);

                        p.setOriginalHtmlTag(tag);
                    }

                    SimplePipeline.runPipeline(jCas,
                            AnalysisEngineFactory.createEngineDescription(StanfordSegmenter.class,
                                    // only on existing WebParagraph annotations
                                    StanfordSegmenter.PARAM_ZONE_TYPES, WebParagraph.class.getCanonicalName()));

                    // now convert to XMI
                    ByteArrayOutputStream byteOutputStream = new ByteArrayOutputStream();
                    XmiCasSerializer.serialize(jCas.getCas(), byteOutputStream);

                    // encode to base64
                    String encoded = new BASE64Encoder().encode(byteOutputStream.toByteArray());

                    rankedResults.originalXmi = encoded;
                }
            }
        }

        // and save the query to output dir
        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }

}