Example usage for org.apache.commons.io FileUtils listFiles

Introduction

On this page you can find example usages of the listFiles method of org.apache.commons.io.FileUtils.

Prototype

public static Collection listFiles(File directory, String[] extensions, boolean recursive)

Source Link

Document

Finds files within a given directory (and optionally its subdirectories) which match an array of extensions.

Usage

From source file:eu.annocultor.analyzers.SolrPropertyHitsAnalyzer.java

/**
 * @param args/*from ww  w .j ava 2  s .c  o  m*/
 */
public static void main(String[] args) throws Exception {

    String solrUrl = args[0];
    SolrServer solr = new CommonsHttpSolrServer(solrUrl);

    String prefixOne = args[1];
    String prefixTwo = args[2];

    long prefixOneCount = 0;
    long prefixTwoCount = 0;

    long totalPassedCount = 0;

    for (File logLocation : FileUtils.listFiles(new File(args[3]), null, true)) {
        System.out.println("Parsing " + logLocation);

        for (String line : FileUtils.readLines(logLocation)) {
            if (StringUtils.contains(line, "FULL_RESULT_HMTL")) {
                line = StringUtils.substringAfter(line, "europeana_uri=");
                String solrDocumentId = StringUtils.substringBefore(line, ",");
                String query = extractQuery(line);
                if (StringUtils.startsWith(solrDocumentId, "http://") && isLongEnoughToCount(query)) {

                    SolrQuery solrQuery = new SolrQuery("europeana_uri:\"" + solrDocumentId + "\"");
                    QueryResponse response = solr.query(solrQuery);
                    SolrDocumentList sourceDocs = response.getResults();
                    if (sourceDocs.isEmpty()) {
                        System.out.println("Could not find object " + solrDocumentId);
                    } else {
                        SolrDocument document = sourceDocs.get(0);

                        if (hasWord(document, prefixOne, query)) {
                            prefixOneCount++;
                        } else {
                            if (hasWord(document, prefixTwo, query)) {
                                prefixTwoCount++;
                            }
                        }
                    }
                }
                totalPassedCount++;
            }
        }
        System.out.println(prefixOne + " : " + prefixOneCount + " " + prefixTwo + " : " + prefixTwoCount
                + " of total passed entries " + totalPassedCount);
    }
}

From source file:de.tudarmstadt.ukp.csniper.resbuild.stuff.FilterPipe.java

/**
 * Collects XML file names derived from the CSV files found (recursively) under
 * {@code base} and writes them, one per line, to the exclusions file.
 *
 * <p>Each CSV name has its last extension replaced by {@code .xml}. A progress line
 * is printed after every 100 files.
 *
 * @param args unused
 * @throws IOException if a CSV file cannot be read or the exclusions file cannot be written
 */
public static void main(String[] args) throws IOException {
    List<String> exclusions = new ArrayList<String>();
    int processed = 0;
    for (File csvFile : FileUtils.listFiles(new File(base), new String[] { "csv" }, true)) {
        String xmlName = StringUtils.substringBeforeLast(csvFile.getName(), ".") + ".xml";
        String text = FileUtils.readFileToString(csvFile, "UTF-8");

        exclusions.add(xmlName);
        // NOTE(review): the search-character string is empty here, so this check presumably
        // never fires; the intended characters may have been lost - confirm against the
        // original source.
        if (StringUtils.containsAny(text, "")) {
            exclusions.remove(xmlName);
        }

        processed++;
        if (processed % 100 == 0) {
            System.out.println("ok:" + processed);
        }
    }

    File target = new File("D:\\hadoop\\output\\BNC_new\\exclusions.txt");
    FileUtils.writeLines(target, "UTF-8", exclusions);
}

From source file:com.doculibre.constellio.utils.resources.WriteResourceBundleUtils.java

/**
 * Turns default-language resource bundles into base bundles.
 *
 * <p>For every {@code .properties} file below the project's {@code source} directory,
 * the bundle base name is derived (text before the first {@code _}, without the
 * {@code .properties} suffix). When both the default-language and the other-language
 * variants exist but the language-less base file does not, the default-language file is
 * renamed to the base file and an empty default-language file is created in its place.
 *
 * @param args either empty (English is the default language, French the other one) or
 *             {@code args[0]} default language code and {@code args[1]} other language code
 * @throws IllegalArgumentException if exactly one argument is supplied
 * @throws Exception if a file cannot be renamed or written
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
    File binDir = ClasspathUtils.getClassesDir();
    File projectDir = binDir.getParentFile();
    File sourceDir = new File(projectDir, "source");

    String defaultLanguage;
    String otherLanguage;
    if (args.length >= 2) {
        defaultLanguage = args[0];
        otherLanguage = args[1];
    } else if (args.length == 1) {
        // A lone argument used to fail with ArrayIndexOutOfBoundsException on args[1].
        throw new IllegalArgumentException(
                "Expected either no arguments or <defaultLanguage> <otherLanguage>");
    } else {
        defaultLanguage = Locale.ENGLISH.getLanguage();
        otherLanguage = Locale.FRENCH.getLanguage();
    }

    // Only iteration is needed, so do not depend on the concrete collection type returned.
    Iterable<File> propertiesFiles = (Iterable<File>) FileUtils.listFiles(sourceDir,
            new String[] { "properties" }, true);
    for (File propertiesFile : propertiesFiles) {
        File propertiesDir = propertiesFile.getParentFile();

        String propertiesNameWoutSuffix = StringUtils.substringBefore(propertiesFile.getName(), "_");
        propertiesNameWoutSuffix = StringUtils.substringBefore(propertiesNameWoutSuffix, ".properties");

        String noLanguageFileName = propertiesNameWoutSuffix + ".properties";
        String defaultLanguageFileName = propertiesNameWoutSuffix + "_" + defaultLanguage + ".properties";
        String otherLanguageFileName = propertiesNameWoutSuffix + "_" + otherLanguage + ".properties";

        File noLanguageFile = new File(propertiesDir, noLanguageFileName);
        File defaultLanguageFile = new File(propertiesDir, defaultLanguageFileName);
        File otherLanguageFile = new File(propertiesDir, otherLanguageFileName);

        if (defaultLanguageFile.exists() && otherLanguageFile.exists() && !noLanguageFile.exists()) {
            System.out.println(defaultLanguageFile.getPath() + " > " + noLanguageFileName);
            System.out.println(defaultLanguageFile.getPath() + " > empty file");

            // Stop if the rename fails: writing the empty file afterwards would otherwise
            // truncate the only copy of the default-language bundle.
            if (!defaultLanguageFile.renameTo(noLanguageFile)) {
                throw new java.io.IOException("Could not rename " + defaultLanguageFile.getPath() + " to "
                        + noLanguageFile.getPath());
            }

            FileWriter defaultLanguageEmptyFileWriter = new FileWriter(defaultLanguageFile);
            try {
                defaultLanguageEmptyFileWriter.write("");
            } finally {
                IOUtils.closeQuietly(defaultLanguageEmptyFileWriter);
            }
        }
    }
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step3AddRawDocumentsFromClueWeb.java

/**
 * Adds the raw HTML from a WARC archive to the ranked results of each query container.
 *
 * <p>For every XML query container in the input directory, the whole WARC file is read
 * once; each record whose {@code WARC-TREC-ID} equals a result's {@code clueWebID} has
 * its content (HTTP headers removed, trimmed, XML-sanitized) stored as that result's
 * {@code originalHtml}. Results left without HTML are reported on stderr and have their
 * relevance set to {@code "false"}. The container is then written to the output directory
 * as {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory with XML query containers,
 *             {@code args[1]} WARC file, {@code args[2]} output directory
 * @throws IOException if reading or writing fails
 */
public static void main(String[] args) throws IOException {
    File inputDir = new File(args[0]);
    File warc = new File(args[1]);
    File outputDir = new File(args[2]);

    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    for (File containerFile : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        String containerXml = FileUtils.readFileToString(containerFile, "utf-8");
        QueryResultContainer queryResultContainer = QueryResultContainer.fromXML(containerXml);

        // The archive is scanned from the start for every query container.
        // NOTE(review): the reader is never closed here - confirm whether WARCFileReader needs closing.
        WARCFileReader reader = new WARCFileReader(new Configuration(), new Path(warc.getAbsolutePath()));
        try {
            for (;;) {
                WARCRecord record = reader.read();
                String trecId = record.getHeader().getField("WARC-TREC-ID");

                for (QueryResultContainer.SingleRankedResult result : queryResultContainer.rankedResults) {
                    if (!result.clueWebID.equals(trecId)) {
                        continue;
                    }
                    // TODO fix coding?
                    String fullHTTPResponse = new String(record.getContent(), "utf-8");
                    String html = removeHTTPHeaders(fullHTTPResponse);
                    result.originalHtml = sanitizeXmlChars(html.trim());
                }
            }
        } catch (EOFException endOfArchive) {
            // reaching the end of the archive is what terminates the read loop
        }

        // anything still without HTML was not present in the archive
        for (QueryResultContainer.SingleRankedResult result : queryResultContainer.rankedResults) {
            if (result.originalHtml != null) {
                continue;
            }
            System.err.println("Missing original html for\t" + result.clueWebID
                    + ", setting relevance to false");
            result.relevant = Boolean.FALSE.toString();
        }

        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }
}

From source file:com.nuance.expertassistant.ContentCrawler.java

/**
 * Entry point of the content crawler.
 *
 * <p>Without arguments a fixed sample page is extracted into a fixed output file.
 * With arguments the input type decides the mode: a single file is translated, every
 * matching file below a folder is translated into a mirrored output tree, and any
 * other input type is extracted directly from {@code getInput()} into {@code getOutput()}.
 *
 * @param args command-line options handed to {@code ContentCrawlerOptions}; may be empty
 */
public static void main(String args[]) {
    if (args.length == 0) {
        ContentExtractor.startDocument("Test3", "/Users/abhishek_rohatgi/" + "Test3" + ".xml");
        ContentExtractor.extract("http://www.audihelp.com/auda-147-tyre_repairs.html");
        ContentExtractor.endDocument();
        return;
    }

    // The instance itself is not used afterwards; presumably the constructor populates the
    // static option accessors used below - verify against ContentCrawlerOptions.
    final ContentCrawlerOptions options = new ContentCrawlerOptions(args);
    final ContentCrawlerInputTypes inputType = ContentCrawlerOptions.getInputType();

    if (ContentCrawlerInputTypes.FILE.equals(inputType)) {
        final File singleOutput = new File(getOutput());
        if (!singleOutput.getParentFile().exists()) {
            singleOutput.getParentFile().mkdirs();
        }
        translateFile(getInput(), getOutput());
        return;
    }

    if (!ContentCrawlerInputTypes.FOLDER.equals(inputType)) {
        ContentExtractor.startDocument(getInput(), getOutput());
        ContentExtractor.extract(getInput());
        ContentExtractor.endDocument();
        return;
    }

    // Folder mode: mirror the input tree below the output folder, appending ".xml" to each name.
    final File outputFolder = new File(getOutput());
    final Collection<File> inputFiles = FileUtils.listFiles(new File(getInput()),
            new RegexFileFilter("^(.*\\.(html)?)"), DirectoryFileFilter.DIRECTORY);
    for (final File inputFile : inputFiles) {
        final String relativeName = inputFile.getAbsolutePath().substring(getInput().length()) + ".xml";
        final File target = new File(outputFolder, relativeName);
        if (!target.getParentFile().exists()) {
            target.getParentFile().mkdirs();
        }
        translateFile(inputFile.getAbsolutePath(), target.getAbsolutePath());
    }
}

From source file:com.makkajai.ObjCToCpp.java

/**
 * Main method: translates the {@code .h} and {@code .m} files found below an input
 * directory and writes the results to an output directory.
 *
 * <p>Files are processed in list order. When {@code nextFileIsM} reports that the current
 * file is followed by its {@code .m} file, both are handled together: a dry run on the
 * current file, then a real run on the {@code .m} path, then a real run on the current
 * file. Otherwise the current file is translated on its own. Files for which
 * {@code isIgnoredFile} returns true are skipped. A failure while processing one file
 * (or pair) is logged and processing continues with the next one.
 *
 * @param args - First argument is the input directory to scan and second is the output directory to write files to.
 *             An optional third argument names a text file whose non-empty lines are passed
 *             to {@code isIgnoredFile} as the list of files to skip.
 * @throws IOException if the file given as third argument cannot be read
 */
public static void main(String[] args) throws IOException {

    if (args.length < 2) {
        System.out.println("Invalid Arguments!");
        System.out.println(
                "Usage: java com.makkajai.ObjCToCpp \"<directory to scan for .h and .m files>\" \"<directory to write .h and .cpp files>\"");
        return;
    }

    String inputDirectory = args[0];
    String outputDirectory = args[1];

    List<String> exceptFiles = new ArrayList<String>();

    if (args.length == 3) {
        BufferedReader exceptFileReader = new BufferedReader(new FileReader(args[2]));
        try {
            String exceptFile;
            while ((exceptFile = exceptFileReader.readLine()) != null) {
                if (exceptFile.equals(""))
                    continue;
                exceptFiles.add(exceptFile);
            }
        } finally {
            // The reader was previously left open.
            exceptFileReader.close();
        }
    }

    //Getting all the files from the input directory.
    final List<File> files = new ArrayList<File>(FileUtils.listFiles(new File(inputDirectory),
            new RegexFileFilter(FILE_NAME_WITH_H_OR_M), DirectoryFileFilter.DIRECTORY));

    //The instance of the translator.
    ObjCToCppTranslator visitor = new ObjCToCppTranslator();

    for (int i = 0; i < files.size();) {
        File currentFile = files.get(i);
        String filePathRelativeToInput = currentFile.getAbsolutePath().replace(inputDirectory, "");
        Date startTime = new Date();
        // Number of list entries this iteration consumes. It is applied in the finally block so
        // that a failure can never leave the index unchanged and loop forever on the same file.
        int consumed = 1;
        try {
            final TranslateFileInput translateFileInput = new TranslateFileInput(inputDirectory,
                    outputDirectory, filePathRelativeToInput, false);
            if (nextFileIsM(currentFile, files, i)) {
                // Header and implementation are handled as one unit, even when ignored or failing.
                consumed = 2;
                if (!isIgnoredFile(filePathRelativeToInput, exceptFiles)) {
                    translateFileInput.dryRun = true;
                    visitor.translateFile(translateFileInput);
                    Date stopTime = new Date();
                    System.out.println("Dry run File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime, stopTime));

                    Date startTime1 = new Date();
                    translateFileInput.filePathRelativeToInput = filePathRelativeToInput.replace(H, M);
                    translateFileInput.dryRun = false;
                    visitor.translateFile(translateFileInput);
                    stopTime = new Date();
                    System.out.println("Processed File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime1, stopTime));

                    Date startTime2 = new Date();
                    translateFileInput.filePathRelativeToInput = filePathRelativeToInput;
                    translateFileInput.dryRun = false;
                    visitor.translateFile(translateFileInput);
                    stopTime = new Date();
                    System.out.println("Processed File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime2, stopTime));
                }
            } else if (!isIgnoredFile(filePathRelativeToInput, exceptFiles)) {
                visitor.translateFile(translateFileInput);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("###########################Error Processing: " + filePathRelativeToInput
                    + ", Continuing with next set of tiles");
        } finally {
            i += consumed;
        }
    }
}

From source file:com.github.xbn.examples.io.non_xbn.SizeOrderAllFilesInDirXmpl.java

/**
 * Prints every {@code .java} file below a fixed directory, largest first, as
 * {@code <size>: <path>}.
 *
 * <p>Files are grouped by length in a sorted map (several files may share a length),
 * and the groups are then printed from the largest length to the smallest.
 *
 * @param ignored unused
 */
public static final void main(String[] ignored) {
    File sourceRoot = new File("R:\\jeffy\\programming\\sandbox\\xbnjava\\xbn\\");
    Collection<File> javaFiles = FileUtils.listFiles(sourceRoot, new String[] { "java" }, true);

    // length -> all files having exactly that length
    TreeMap<Long, List<File>> filesBySize = new TreeMap<Long, List<File>>();
    for (File file : javaFiles) {
        Long size = file.length();
        List<File> sameSize = filesBySize.get(size);
        if (sameSize == null) {
            sameSize = new ArrayList<File>();
            filesBySize.put(size, sameSize);
        }
        sameSize.add(file);
    }

    // walk the keys from largest to smallest
    for (List<File> sameSize : filesBySize.descendingMap().values()) {
        for (File file : sameSize) {
            System.out.println(file.length() + ": " + file.getPath());
        }
    }
}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step4BoilerPlateRemoval.java

/**
 * Runs boilerplate removal over the raw HTML of every ranked result.
 *
 * <p>Each XML query container in the input directory is loaded; for every result that
 * has {@code originalHtml}, the output of {@code getMinimalHtml} is stored in
 * {@code plainText}. Unless original HTML is to be kept, {@code originalHtml} is then
 * cleared. The container is written to the output directory as {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory, {@code args[1]} output directory,
 *             optional {@code args[2]}: the literal {@code false} drops the original HTML
 *             (any other value, or no value, keeps it)
 * @throws IOException if reading or writing a container fails
 */
public static void main(String[] args) throws IOException {
    File inputDir = new File(args[0]);
    File outputDir = new File(args[1]);

    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    // original HTML is kept by default; only an explicit "false" turns that off
    boolean keepOriginalHTML = args.length <= 2 || !"false".equals(args[2]);
    System.out.println(keepOriginalHTML);

    BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();

    for (File containerFile : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        String containerXml = FileUtils.readFileToString(containerFile, "utf-8");
        QueryResultContainer queryResultContainer = QueryResultContainer.fromXML(containerXml);

        for (QueryResultContainer.SingleRankedResult result : queryResultContainer.rankedResults) {
            // there are some empty (corrupted) documents in ClueWeb, namely 0308wb-83.warc.gz
            boolean hasHtml = result.originalHtml != null;
            if (hasHtml) {
                result.plainText = boilerPlateRemoval.getMinimalHtml(result.originalHtml, null);
            }

            if (!keepOriginalHTML) {
                result.originalHtml = null;
            }
        }

        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }
}

From source file:de.tudarmstadt.ukp.experiments.argumentation.convincingness.sampling.Step4MTurkOutputCollector.java

/**
 * Collects MTurk results and attaches them to previously prepared argument pairs.
 *
 * <p>Steps, in order: resolve the result files from {@code args[1]}; make sure the output
 * directory exists and is empty; print accept/reject statistics over all records; then,
 * for every XML file of argument pairs in the input directory, attach the non-rejected
 * assignments that answer each pair and write the annotated pairs to the output directory
 * under the same file name. Finally, the IDs of assignments lacking a reason are printed.
 *
 * @param args {@code args[0]} directory with XML files of argument pairs;
 *             {@code args[1]} either a path containing {@code *} or a comma-separated
 *             list of result files; {@code args[2]} output directory
 * @throws IOException if the output directory cannot be created or no XML input files are found
 * @throws IllegalArgumentException if the output directory is not empty
 * @throws IllegalStateException if a record's assignment status is neither Approved nor Rejected
 * @throws Exception on any other failure while reading or writing
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
    String inputDirWithArgumentPairs = args[0];

    File[] resultFiles;

    if (args[1].contains("*")) {
        // wildcard form: list the files in the parent directory of the given path
        File path = new File(args[1]);
        File directory = path.getParentFile();
        // despite the variable name, this is simply the file name with every '*' removed
        String regex = path.getName().replaceAll("\\*", "");

        // NOTE(review): the value is passed to listFiles as a file extension, not as a pattern;
        // confirm that an argument such as "dir/*.csv" (which yields ".csv" here, leading dot
        // included) matches the intended files.
        List<File> files = new ArrayList<>(FileUtils.listFiles(directory, new String[] { regex }, false));
        resultFiles = new File[files.size()];
        for (int i = 0; i < files.size(); i++) {
            resultFiles[i] = files.get(i);
        }
    } else {
        // result file is a comma-separated list of CSV files from MTurk
        String[] split = args[1].split(",");
        resultFiles = new File[split.length];
        for (int i = 0; i < split.length; i++) {
            resultFiles[i] = new File(split[i]);
        }
    }

    File outputDir = new File(args[2]);

    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            throw new IOException("Cannot create directory " + outputDir);
        }
    }

    // error if output folder not empty to prevent any confusion by mixing files
    if (!FileUtils.listFiles(outputDir, null, false).isEmpty()) {
        throw new IllegalArgumentException("Output dir " + outputDir + " is not empty");
    }

    // collected assignments with empty reason for rejections
    Set<String> assignmentsWithEmptyReason = new HashSet<>();

    // parse with first line as header
    MTurkOutputReader mTurkOutputReader = new MTurkOutputReader(resultFiles);

    Collection<File> files = FileUtils.listFiles(new File(inputDirWithArgumentPairs), new String[] { "xml" },
            false);

    if (files.isEmpty()) {
        throw new IOException("No xml files found in " + inputDirWithArgumentPairs);
    }

    // statistics: how many hits with how many assignments ; hit ID / assignments
    // outer key: HIT type ID, inner key: HIT ID, value: number of non-rejected assignments
    Map<String, Map<String, Integer>> assignmentsPerHits = new HashMap<>();

    // collect accept/reject statistics
    // (this map is only read by the commented-out block further below)
    for (Map<String, String> record : mTurkOutputReader) {
        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");
        String hitTypeId = record.get("hittypeid");

        if (!wasRejected) {
            // update statistics
            if (!assignmentsPerHits.containsKey(hitTypeId)) {
                assignmentsPerHits.put(hitTypeId, new HashMap<String, Integer>());
            }

            if (!assignmentsPerHits.get(hitTypeId).containsKey(hitID)) {
                assignmentsPerHits.get(hitTypeId).put(hitID, 0);
            }

            assignmentsPerHits.get(hitTypeId).put(hitID, assignmentsPerHits.get(hitTypeId).get(hitID) + 1);
        }
    }

    // statistics: how many hits with how many assignments ; hit ID / assignments
    Map<String, Integer> approvedAssignmentsPerHit = new HashMap<>();
    Map<String, Integer> rejectedAssignmentsPerHit = new HashMap<>();

    // collect accept/reject statistics
    // (second pass over all records; any status other than Approved/Rejected aborts the run)
    for (Map<String, String> record : mTurkOutputReader) {
        boolean approved = "Approved".equals(record.get("assignmentstatus"));
        boolean rejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");

        if (approved) {
            // update statistics
            if (!approvedAssignmentsPerHit.containsKey(hitID)) {
                approvedAssignmentsPerHit.put(hitID, 0);
            }

            approvedAssignmentsPerHit.put(hitID, approvedAssignmentsPerHit.get(hitID) + 1);
        } else if (rejected) {
            // update statistics
            if (!rejectedAssignmentsPerHit.containsKey(hitID)) {
                rejectedAssignmentsPerHit.put(hitID, 0);
            }

            rejectedAssignmentsPerHit.put(hitID, rejectedAssignmentsPerHit.get(hitID) + 1);
        } else {
            throw new IllegalStateException(
                    "Unknown state: " + record.get("assignmentstatus") + " HITID: " + hitID);
        }
    }

    //        System.out.println("Approved: " + approvedAssignmentsPerHit);
    //        System.out.println("Rejected: " + rejectedAssignmentsPerHit);

    // print the distinct per-HIT counts rather than the full maps
    System.out.println("Approved (values): " + new HashSet<>(approvedAssignmentsPerHit.values()));
    System.out.println("Rejected (values): " + new HashSet<>(rejectedAssignmentsPerHit.values()));
    // rejection statistics
    int totalRejected = 0;
    for (Map.Entry<String, Integer> rejectionEntry : rejectedAssignmentsPerHit.entrySet()) {
        totalRejected += rejectionEntry.getValue();
    }

    System.out.println("Total rejections: " + totalRejected);

    /*
    // generate .success files for adding more annotations
    for (File resultFile : resultFiles) {
    String hitTypeID = mTurkOutputReader.getHitTypeIdForFile().get(resultFile);

    // assignments for that hittypeid (= file)
    Map<String, Integer> assignments = assignmentsPerHits.get(hitTypeID);

    prepareUpdateHITsFiles(assignments, hitTypeID, resultFile);
    }
    */

    int totalSavedPairs = 0;

    // load all previously prepared argument pairs
    for (File file : files) {
        List<ArgumentPair> argumentPairs = (List<ArgumentPair>) XStreamTools.getXStream().fromXML(file);

        List<AnnotatedArgumentPair> annotatedArgumentPairs = new ArrayList<>();

        for (ArgumentPair argumentPair : argumentPairs) {
            AnnotatedArgumentPair annotatedArgumentPair = new AnnotatedArgumentPair(argumentPair);

            // is there such an answer?
            String key = "Answer." + argumentPair.getId();

            // iterate only if there is such column to save time
            if (mTurkOutputReader.getColumnNames().contains(key)) {
                // now find the results
                for (Map<String, String> record : mTurkOutputReader) {
                    if (record.containsKey(key)) {
                        // extract the values
                        AnnotatedArgumentPair.MTurkAssignment assignment = new AnnotatedArgumentPair.MTurkAssignment();

                        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));

                        // only non-rejected (if required)
                        if (!wasRejected) {
                            String hitID = record.get("hitid");
                            String workerID = record.get("workerid");
                            String assignmentId = record.get("assignmentid");
                            try {
                                assignment.setAssignmentAcceptTime(
                                        DATE_FORMAT.parse(record.get("assignmentaccepttime")));
                                assignment.setAssignmentSubmitTime(
                                        DATE_FORMAT.parse(record.get("assignmentsubmittime")));
                                assignment.setHitComment(record.get("Answer.feedback"));
                                assignment.setHitID(hitID);
                                assignment.setTurkID(workerID);
                                assignment.setAssignmentId(assignmentId);

                                // and answer specific fields
                                String valueRaw = record.get(key);

                                // so far the label has had format aXXX_aYYY_a1, aXXX_aYYY_a2, or aXXX_aYYY_equal
                                // strip now only true label
                                String label = valueRaw.split("_")[2];

                                assignment.setValue(label);
                                String reason = record.get(key + "_reason");

                                // missing reason
                                // (the assignment is remembered for the report at the end and not attached)
                                if (reason == null) {
                                    assignmentsWithEmptyReason.add(assignmentId);
                                } else {
                                    assignment.setReason(reason);

                                    // get worker's stance
                                    String stanceRaw = record.get(key + "_stance");
                                    if (stanceRaw != null) {
                                        // parse stance
                                        String stance = stanceRaw.split("_stance_")[1];
                                        assignment.setWorkerStance(stance);
                                    }

                                    // we take maximal 5 assignments
                                    // NOTE(review): the list is sorted by accept time before the new
                                    // assignment is added, and the size check below keeps whichever
                                    // assignments arrive first in record order - confirm that this is
                                    // the intended selection.
                                    Collections.sort(annotatedArgumentPair.mTurkAssignments,
                                            new Comparator<AnnotatedArgumentPair.MTurkAssignment>() {
                                                @Override
                                                public int compare(AnnotatedArgumentPair.MTurkAssignment o1,
                                                        AnnotatedArgumentPair.MTurkAssignment o2) {
                                                    return o1.getAssignmentAcceptTime()
                                                            .compareTo(o2.getAssignmentAcceptTime());
                                                }
                                            });

                                    if (annotatedArgumentPair.mTurkAssignments
                                            .size() < MAXIMUM_ASSIGNMENTS_PER_HIT) {
                                        annotatedArgumentPair.mTurkAssignments.add(assignment);
                                    }
                                }
                            } catch (IllegalArgumentException | NullPointerException ex) {
                                // a malformed record is reported and skipped; processing continues
                                System.err.println("Malformed annotations for HIT " + hitID + ", worker "
                                        + workerID + ", assignment " + assignmentId + "; " + ex.getMessage()
                                        + ", full record: " + record);
                            }
                        }
                    }
                }
            }

            // and if there are some annotations, add it to the result set
            if (!annotatedArgumentPair.mTurkAssignments.isEmpty()) {
                annotatedArgumentPairs.add(annotatedArgumentPair);
            }
        }

        // input files that yielded no annotated pairs produce no output file
        if (!annotatedArgumentPairs.isEmpty()) {
            File outputFile = new File(outputDir, file.getName());
            XStreamTools.toXML(annotatedArgumentPairs, outputFile);

            System.out.println("Saved " + annotatedArgumentPairs.size() + " annotated pairs to " + outputFile);
            totalSavedPairs += annotatedArgumentPairs.size();
        }
    }

    System.out.println("Total saved " + totalSavedPairs + " pairs");

    // print assignments with empty reasons
    if (!assignmentsWithEmptyReason.isEmpty()) {
        System.out.println(
                "== Assignments with empty reason:\nassignmentIdToReject\tassignmentIdToRejectComment");
        for (String assignmentId : assignmentsWithEmptyReason) {
            System.out.println(
                    assignmentId + "\t\"Dear worker, you did not fill the required field with a reason.\"");
        }
    }

}

From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step5LinguisticPreprocessing.java

/**
 * Runs linguistic preprocessing over the boilerplate-free text of every ranked result.
 *
 * <p>For each XML query container in the input directory and each result with
 * {@code plainText}: every line must match {@code OPENING_TAG_PATTERN}; the captured tag
 * is remembered, the leading and trailing tags are stripped, {@code &nbsp;} is replaced by
 * a space and the line is trimmed. Non-empty lines are joined into a document that is
 * annotated with {@code WebParagraphAnnotator}, tagged with the remembered HTML tags,
 * segmented with {@code StanfordSegmenter}, serialized to XMI and stored base64-encoded
 * in {@code originalXmi}. The container is then written to the output directory as
 * {@code <qID>.xml}.
 *
 * @param args {@code args[0]} input directory, {@code args[1]} output directory
 * @throws IllegalArgumentException if a line does not yield an HTML tag
 * @throws IllegalStateException if the number of annotated paragraphs differs from the number of kept lines
 * @throws Exception if reading, annotating, serializing or writing fails
 */
public static void main(String[] args) throws Exception {
    File inputDir = new File(args[0]);
    File outputDir = new File(args[1]);

    if (!outputDir.exists()) {
        outputDir.mkdirs();
    }

    for (File containerFile : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) {
        String containerXml = FileUtils.readFileToString(containerFile, "utf-8");
        QueryResultContainer queryResultContainer = QueryResultContainer.fromXML(containerXml);

        for (QueryResultContainer.SingleRankedResult result : queryResultContainer.rankedResults) {
            if (result.plainText == null) {
                continue;
            }

            String[] lines = StringUtils.split(result.plainText, "\n");

            // parallel lists: cleaned text of each kept line and the tag it came from
            List<String> cleanLines = new ArrayList<>(lines.length);
            List<String> lineTags = new ArrayList<>(lines.length);

            for (String line : lines) {
                Matcher tagMatcher = OPENING_TAG_PATTERN.matcher(line);
                String tag = tagMatcher.find() ? tagMatcher.group(1) : null;

                if (tag == null) {
                    throw new IllegalArgumentException("No html tag found for line:\n" + line);
                }

                // drop the wrapping tags, normalise non-breaking spaces, trim
                String noTagText = line.replaceAll("^<\\S+>", "").replaceAll("</\\S+>$", "")
                        .replaceAll("&nbsp;", " ").trim();

                if (noTagText.isEmpty()) {
                    continue;
                }
                cleanLines.add(noTagText);
                lineTags.add(tag);
            }

            if (cleanLines.isEmpty()) {
                System.err.println("Document " + result.clueWebID + " in query "
                        + queryResultContainer.qID + " is empty");
                continue;
            }

            // one paragraph per kept line
            JCas jCas = JCasFactory.createJCas();
            jCas.setDocumentText(StringUtils.join(cleanLines, "\n"));
            jCas.setDocumentLanguage("en");

            SimplePipeline.runPipeline(jCas,
                    AnalysisEngineFactory.createEngineDescription(WebParagraphAnnotator.class));

            List<WebParagraph> webParagraphs = new ArrayList<>(JCasUtil.select(jCas, WebParagraph.class));

            // paragraphs and remembered tags are matched by position, so the counts must agree
            if (webParagraphs.size() != lineTags.size()) {
                throw new IllegalStateException(
                        "Different size of annotated paragraphs and original lines");
            }

            int index = 0;
            for (WebParagraph paragraph : webParagraphs) {
                paragraph.setOriginalHtmlTag(lineTags.get(index));
                index++;
            }

            SimplePipeline.runPipeline(jCas,
                    AnalysisEngineFactory.createEngineDescription(StanfordSegmenter.class,
                            // only on existing WebParagraph annotations
                            StanfordSegmenter.PARAM_ZONE_TYPES, WebParagraph.class.getCanonicalName()));

            // serialize to XMI and keep it base64-encoded on the result
            ByteArrayOutputStream xmiBytes = new ByteArrayOutputStream();
            XmiCasSerializer.serialize(jCas.getCas(), xmiBytes);
            result.originalXmi = new BASE64Encoder().encode(xmiBytes.toByteArray());
        }

        File outputFile = new File(outputDir, queryResultContainer.qID + ".xml");
        FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8");
        System.out.println("Finished " + outputFile);
    }
}