List of usage examples for org.apache.commons.io FileUtils listFiles
public static Collection<File> listFiles(File directory, String[] extensions, boolean recursive)
From source file:eu.annocultor.analyzers.SolrPropertyHitsAnalyzer.java
/** * @param args/*from ww w .j ava 2 s .c o m*/ */ public static void main(String[] args) throws Exception { String solrUrl = args[0]; SolrServer solr = new CommonsHttpSolrServer(solrUrl); String prefixOne = args[1]; String prefixTwo = args[2]; long prefixOneCount = 0; long prefixTwoCount = 0; long totalPassedCount = 0; for (File logLocation : FileUtils.listFiles(new File(args[3]), null, true)) { System.out.println("Parsing " + logLocation); for (String line : FileUtils.readLines(logLocation)) { if (StringUtils.contains(line, "FULL_RESULT_HMTL")) { line = StringUtils.substringAfter(line, "europeana_uri="); String solrDocumentId = StringUtils.substringBefore(line, ","); String query = extractQuery(line); if (StringUtils.startsWith(solrDocumentId, "http://") && isLongEnoughToCount(query)) { SolrQuery solrQuery = new SolrQuery("europeana_uri:\"" + solrDocumentId + "\""); QueryResponse response = solr.query(solrQuery); SolrDocumentList sourceDocs = response.getResults(); if (sourceDocs.isEmpty()) { System.out.println("Could not find object " + solrDocumentId); } else { SolrDocument document = sourceDocs.get(0); if (hasWord(document, prefixOne, query)) { prefixOneCount++; } else { if (hasWord(document, prefixTwo, query)) { prefixTwoCount++; } } } } totalPassedCount++; } } System.out.println(prefixOne + " : " + prefixOneCount + " " + prefixTwo + " : " + prefixTwoCount + " of total passed entries " + totalPassedCount); } }
From source file:de.tudarmstadt.ukp.csniper.resbuild.stuff.FilterPipe.java
public static void main(String[] args) throws IOException { List<String> files = new ArrayList<String>(); int i = 0;//from w ww .ja v a 2s . c o m for (File file : FileUtils.listFiles(new File(base), new String[] { "csv" }, true)) { String text = FileUtils.readFileToString(file, "UTF-8"); files.add(StringUtils.substringBeforeLast(file.getName(), ".") + ".xml"); if (StringUtils.containsAny(text, "")) { files.remove(StringUtils.substringBeforeLast(file.getName(), ".") + ".xml"); } i++; if (i % 100 == 0) { System.out.println("ok:" + i); } } FileUtils.writeLines(new File("D:\\hadoop\\output\\BNC_new\\exclusions.txt"), "UTF-8", files); }
From source file:com.doculibre.constellio.utils.resources.WriteResourceBundleUtils.java
@SuppressWarnings("unchecked") public static void main(String[] args) throws Exception { File binDir = ClasspathUtils.getClassesDir(); File projectDir = binDir.getParentFile(); File sourceDir = new File(projectDir, "source"); String defaultLanguage;//from ww w . ja v a2 s . co m String otherLanguage; if (args.length > 0) { defaultLanguage = args[0]; otherLanguage = args[1]; } else { defaultLanguage = Locale.ENGLISH.getLanguage(); otherLanguage = Locale.FRENCH.getLanguage(); } List<File> propertiesFiles = (List<File>) FileUtils.listFiles(sourceDir, new String[] { "properties" }, true); for (File propertiesFile : propertiesFiles) { File propertiesDir = propertiesFile.getParentFile(); String propertiesNameWoutSuffix = StringUtils.substringBefore(propertiesFile.getName(), "_"); propertiesNameWoutSuffix = StringUtils.substringBefore(propertiesNameWoutSuffix, ".properties"); String noLanguageFileName = propertiesNameWoutSuffix + ".properties"; String defaultLanguageFileName = propertiesNameWoutSuffix + "_" + defaultLanguage + ".properties"; String otherLanguageFileName = propertiesNameWoutSuffix + "_" + otherLanguage + ".properties"; File noLanguageFile = new File(propertiesDir, noLanguageFileName); File defaultLanguageFile = new File(propertiesDir, defaultLanguageFileName); File otherLanguageFile = new File(propertiesDir, otherLanguageFileName); if (defaultLanguageFile.exists() && otherLanguageFile.exists() && !noLanguageFile.exists()) { System.out.println(defaultLanguageFile.getPath() + " > " + noLanguageFileName); System.out.println(defaultLanguageFile.getPath() + " > empty file"); defaultLanguageFile.renameTo(noLanguageFile); FileWriter defaultLanguageEmptyFileWriter = new FileWriter(defaultLanguageFile); defaultLanguageEmptyFileWriter.write(""); IOUtils.closeQuietly(defaultLanguageEmptyFileWriter); } } }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step3AddRawDocumentsFromClueWeb.java
public static void main(String[] args) throws IOException { // input dir - list of xml query containers // step2a-retrieved-results File inputDir = new File(args[0]); // warc.bz file containing all required documents according to ClueWeb IDs // ltr-50queries-100docs-clueweb-export.warc.gz File warc = new File(args[1]); // output dir File outputDir = new File(args[2]); if (!outputDir.exists()) { outputDir.mkdirs();//from w w w .ja va 2 s . c o m } // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); // iterate over warc for each query WARCFileReader reader = new WARCFileReader(new Configuration(), new Path(warc.getAbsolutePath())); try { while (true) { WARCRecord read = reader.read(); String trecId = read.getHeader().getField("WARC-TREC-ID"); // now iterate over retrieved results for the query and find matching IDs for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { if (rankedResults.clueWebID.equals(trecId)) { // add the raw html content String fullHTTPResponse = new String(read.getContent(), "utf-8"); // TODO fix coding? 
String html = removeHTTPHeaders(fullHTTPResponse); rankedResults.originalHtml = sanitizeXmlChars(html.trim()); } } } } catch (EOFException e) { // end of file } // check if all results have filled html for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { if (rankedResults.originalHtml == null) { System.err.println("Missing original html for\t" + rankedResults.clueWebID + ", setting relevance to false"); rankedResults.relevant = Boolean.FALSE.toString(); } } // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:com.nuance.expertassistant.ContentCrawler.java
/**
 * Command-line entry point of the crawler.
 * <p>
 * Without arguments a fixed sample page is extracted. Otherwise the input type taken from
 * {@link ContentCrawlerOptions} selects the mode: a single file is translated, every matching
 * file below a folder is translated into a mirrored output tree, or the input is handed to
 * {@code ContentExtractor} directly.
 *
 * @param args command-line options, passed to {@link ContentCrawlerOptions}
 */
public static void main(String args[]) {
    if (args.length == 0) {
        ContentExtractor.startDocument("Test3", "/Users/abhishek_rohatgi/" + "Test3" + ".xml");
        ContentExtractor.extract("http://www.audihelp.com/auda-147-tyre_repairs.html");
        ContentExtractor.endDocument();
        return;
    }

    // NOTE(review): the instance is never read afterwards; the constructor presumably fills
    // the static state behind getInputType()/getInput()/getOutput() - confirm.
    final ContentCrawlerOptions options = new ContentCrawlerOptions(args);
    final ContentCrawlerInputTypes inputType = ContentCrawlerOptions.getInputType();

    if (ContentCrawlerInputTypes.FILE.equals(inputType)) {
        ensureParentDirectoryExists(new File(getOutput()));
        translateFile(getInput(), getOutput());
    } else if (ContentCrawlerInputTypes.FOLDER.equals(inputType)) {
        final File outputFolder = new File(getOutput());
        final File inputFolder = new File(getInput());
        // Listed files are cut relative to the ABSOLUTE input path; cutting by the length of
        // the raw option value produced wrong names whenever the input was a relative path.
        final int inputRootLength = inputFolder.getAbsolutePath().length();

        final Collection<File> inputFiles = FileUtils.listFiles(inputFolder,
                new RegexFileFilter("^(.*\\.(html)?)"), DirectoryFileFilter.DIRECTORY);
        for (final File inputFile : inputFiles) {
            final String outputFileName = inputFile.getAbsolutePath().substring(inputRootLength) + ".xml";
            final File outputFile = new File(outputFolder, outputFileName);
            ensureParentDirectoryExists(outputFile);
            translateFile(inputFile.getAbsolutePath(), outputFile.getAbsolutePath());
        }
    } else {
        ContentExtractor.startDocument(getInput(), getOutput());
        ContentExtractor.extract(getInput());
        ContentExtractor.endDocument();
    }
}

/** Creates the parent directory of {@code file} when the path has one and it is missing. */
private static void ensureParentDirectoryExists(final File file) {
    // getParentFile() is null for a bare file name such as "out.xml"
    final File parent = file.getParentFile();
    if (parent != null && !parent.exists()) {
        parent.mkdirs();
    }
}
From source file:com.makkajai.ObjCToCpp.java
/**
 * Main Method
 * <p>
 * Translates every file matched by {@code FILE_NAME_WITH_H_OR_M} below the input directory.
 * When {@code nextFileIsM} reports that the current file is followed by its companion, the
 * pair is handled together: a dry run first, then the {@code M} variant, then the file itself.
 * Files accepted by {@code isIgnoredFile} are skipped. A failure on one file (or pair) is
 * reported and processing continues with the next one.
 *
 * @param args - First argument is the input directory to scan and second is the output directory to write files to.
 *             An optional third argument names a text file whose non-empty lines are passed
 *             to {@code isIgnoredFile} as the exclusion list.
 * @throws IOException if the exclusion file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length < 2) {
        System.out.println("Invalid Arguments!");
        System.out.println(
                "Usage: java com.makkajai.ObjCToCpp \"<directory to scan for .h and .m files>\" \"<directory to write .h and .cpp files>\"");
        return;
    }

    String inputDirectory = args[0];
    String outputDirectory = args[1];

    List<String> exceptFiles = new ArrayList<String>();
    if (args.length == 3) {
        BufferedReader exceptFileReader = new BufferedReader(new FileReader(args[2]));
        try {
            String exceptFile;
            while ((exceptFile = exceptFileReader.readLine()) != null) {
                if (exceptFile.equals("")) {
                    continue;
                }
                exceptFiles.add(exceptFile);
            }
        } finally {
            // the reader used to be left open
            exceptFileReader.close();
        }
    }

    //Getting all the files from the input directory.
    final List<File> files = new ArrayList<File>(FileUtils.listFiles(new File(inputDirectory),
            new RegexFileFilter(FILE_NAME_WITH_H_OR_M), DirectoryFileFilter.DIRECTORY));

    //The instance of the translator.
    ObjCToCppTranslator visitor = new ObjCToCppTranslator();

    int i = 0;
    while (i < files.size()) {
        File currentFile = files.get(i);
        String filePathRelativeToInput = currentFile.getAbsolutePath().replace(inputDirectory, "");
        Date startTime = new Date();

        // How far to advance once this iteration is done. It is applied in the finally block
        // so that a failing file can never be retried forever: previously the single-file
        // path skipped its i++ when translateFile threw, which looped endlessly.
        int step = 1;
        try {
            final TranslateFileInput translateFileInput = new TranslateFileInput(inputDirectory,
                    outputDirectory, filePathRelativeToInput, false);

            if (nextFileIsM(currentFile, files, i)) {
                // current file and its companion are consumed together
                step = 2;
                if (!isIgnoredFile(filePathRelativeToInput, exceptFiles)) {
                    translateFileInput.dryRun = true;
                    visitor.translateFile(translateFileInput);
                    Date stopTime = new Date();
                    System.out.println("Dry run File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime, stopTime));

                    Date startTime1 = new Date();
                    translateFileInput.filePathRelativeToInput = filePathRelativeToInput.replace(H, M);
                    translateFileInput.dryRun = false;
                    visitor.translateFile(translateFileInput);
                    stopTime = new Date();
                    System.out.println("Processed File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime1, stopTime));

                    Date startTime2 = new Date();
                    translateFileInput.filePathRelativeToInput = filePathRelativeToInput;
                    translateFileInput.dryRun = false;
                    visitor.translateFile(translateFileInput);
                    stopTime = new Date();
                    System.out.println("Processed File: " + translateFileInput.filePathRelativeToInput
                            + " Time Taken: " + getDelta(startTime2, stopTime));
                }
            } else if (!isIgnoredFile(filePathRelativeToInput, exceptFiles)) {
                visitor.translateFile(translateFileInput);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("###########################Error Processing: " + filePathRelativeToInput
                    + ", Continuing with next set of tiles");
        } finally {
            i += step;
            Date stopTime = new Date();
            System.out.println("Processed File(s): " + filePathRelativeToInput.replaceAll(H_OR_M, "")
                    + " Time Taken: " + getDelta(startTime, stopTime));
        }
    }
}
From source file:com.github.xbn.examples.io.non_xbn.SizeOrderAllFilesInDirXmpl.java
public static final void main(String[] ignored) { File fDir = (new File("R:\\jeffy\\programming\\sandbox\\xbnjava\\xbn\\")); Collection<File> cllf = FileUtils.listFiles(fDir, (new String[] { "java" }), true); //Add all file paths to a Map, keyed by size. //It's actually a map of lists-of-files, to //allow multiple files that happen to have the //same length. TreeMap<Long, List<File>> tmFilesBySize = new TreeMap<Long, List<File>>(); Iterator<File> itrf = cllf.iterator(); while (itrf.hasNext()) { File f = itrf.next();/*w w w .j a va2s . co m*/ Long LLen = f.length(); if (!tmFilesBySize.containsKey(LLen)) { ArrayList<File> alf = new ArrayList<File>(); alf.add(f); tmFilesBySize.put(LLen, alf); } else { tmFilesBySize.get(LLen).add(f); } } //Iterate backwards by key through the map. For each //List<File>, iterate through the files, printing out //its size and path. ArrayList<Long> alSize = new ArrayList<Long>(tmFilesBySize.keySet()); for (int i = alSize.size() - 1; i >= 0; i--) { itrf = tmFilesBySize.get(alSize.get(i)).iterator(); while (itrf.hasNext()) { File f = itrf.next(); System.out.println(f.length() + ": " + f.getPath()); } } }
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step4BoilerPlateRemoval.java
public static void main(String[] args) throws IOException { // input dir - list of xml query containers // step3-filled-raw-html File inputDir = new File(args[0]); // output dir File outputDir = new File(args[1]); if (!outputDir.exists()) { outputDir.mkdirs();// w ww . j a va2 s. c o m } // keep original html? (true == default) boolean keepOriginalHTML = !(args.length > 2 && "false".equals(args[2])); System.out.println(keepOriginalHTML); BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval(); // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { // boilerplate removal // there are some empty (corrupted) documents in ClueWeb, namely 0308wb-83.warc.gz if (rankedResults.originalHtml != null) { rankedResults.plainText = boilerPlateRemoval.getMinimalHtml(rankedResults.originalHtml, null); } if (!keepOriginalHTML) { rankedResults.originalHtml = null; } } // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }
From source file:de.tudarmstadt.ukp.experiments.argumentation.convincingness.sampling.Step4MTurkOutputCollector.java
/**
 * Merges MTurk result files with the previously prepared argument pairs.
 * <p>
 * Reads the result files, prints approval/rejection statistics, then attaches every
 * non-rejected assignment that answers a pair (column {@code "Answer." + pairId}) to that
 * pair and writes the annotated pairs to the output directory under the input file's name.
 * Assignments without a reason are collected and listed at the end.
 *
 * @param args {@code args[0]} directory with XML files of argument pairs; {@code args[1]}
 *             either a path containing {@code *} or a comma-separated list of result files;
 *             {@code args[2]} output directory, which must be empty
 * @throws IOException if the output directory cannot be created or no XML input is found
 * @throws IllegalArgumentException if the output directory is not empty
 * @throws IllegalStateException if a record is neither "Approved" nor "Rejected"
 */
@SuppressWarnings("unchecked")
public static void main(String[] args) throws Exception {
    String inputDirWithArgumentPairs = args[0];

    File[] resultFiles;
    if (args[1].contains("*")) {
        File path = new File(args[1]);
        File directory = path.getParentFile();
        // NOTE(review): despite its name this is not a regex - it is the file name with all
        // '*' removed, handed to listFiles as a file extension. Confirm the expected argument
        // form (a leading '.' would presumably be doubled by the extension filter).
        String regex = path.getName().replaceAll("\\*", "");

        List<File> files = new ArrayList<>(FileUtils.listFiles(directory, new String[] { regex }, false));

        resultFiles = new File[files.size()];
        for (int i = 0; i < files.size(); i++) {
            resultFiles[i] = files.get(i);
        }
    } else {
        // result file is a comma-separated list of CSV files from MTurk
        String[] split = args[1].split(",");
        resultFiles = new File[split.length];
        for (int i = 0; i < split.length; i++) {
            resultFiles[i] = new File(split[i]);
        }
    }

    File outputDir = new File(args[2]);

    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            throw new IOException("Cannot create directory " + outputDir);
        }
    }

    // error if output folder not empty to prevent any confusion by mixing files
    if (!FileUtils.listFiles(outputDir, null, false).isEmpty()) {
        throw new IllegalArgumentException("Output dir " + outputDir + " is not empty");
    }

    // collected assignments with empty reason for rejections
    Set<String> assignmentsWithEmptyReason = new HashSet<>();

    // parse with first line as header
    MTurkOutputReader mTurkOutputReader = new MTurkOutputReader(resultFiles);

    Collection<File> files = FileUtils.listFiles(new File(inputDirWithArgumentPairs), new String[] { "xml" },
            false);

    if (files.isEmpty()) {
        throw new IOException("No xml files found in " + inputDirWithArgumentPairs);
    }

    // statistics: how many hits with how many assignments ; hit ID / assignments
    // (outer key: HIT type id, inner key: HIT id, value: number of non-rejected assignments;
    // only referenced again by the commented-out block further down)
    Map<String, Map<String, Integer>> assignmentsPerHits = new HashMap<>();

    // collect accept/reject statistics
    for (Map<String, String> record : mTurkOutputReader) {
        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");
        String hitTypeId = record.get("hittypeid");

        if (!wasRejected) {
            // update statistics
            if (!assignmentsPerHits.containsKey(hitTypeId)) {
                assignmentsPerHits.put(hitTypeId, new HashMap<String, Integer>());
            }

            if (!assignmentsPerHits.get(hitTypeId).containsKey(hitID)) {
                assignmentsPerHits.get(hitTypeId).put(hitID, 0);
            }

            assignmentsPerHits.get(hitTypeId).put(hitID, assignmentsPerHits.get(hitTypeId).get(hitID) + 1);
        }
    }

    // statistics: how many hits with how many assignments ; hit ID / assignments
    Map<String, Integer> approvedAssignmentsPerHit = new HashMap<>();
    Map<String, Integer> rejectedAssignmentsPerHit = new HashMap<>();

    // collect accept/reject statistics
    // (second pass over the reader; any status other than "Approved"/"Rejected" aborts)
    for (Map<String, String> record : mTurkOutputReader) {
        boolean approved = "Approved".equals(record.get("assignmentstatus"));
        boolean rejected = "Rejected".equals(record.get("assignmentstatus"));
        String hitID = record.get("hitid");

        if (approved) {
            // update statistics
            if (!approvedAssignmentsPerHit.containsKey(hitID)) {
                approvedAssignmentsPerHit.put(hitID, 0);
            }

            approvedAssignmentsPerHit.put(hitID, approvedAssignmentsPerHit.get(hitID) + 1);
        } else if (rejected) {
            // update statistics
            if (!rejectedAssignmentsPerHit.containsKey(hitID)) {
                rejectedAssignmentsPerHit.put(hitID, 0);
            }

            rejectedAssignmentsPerHit.put(hitID, rejectedAssignmentsPerHit.get(hitID) + 1);
        } else {
            throw new IllegalStateException(
                    "Unknown state: " + record.get("assignmentstatus") + " HITID: " + hitID);
        }
    }

    // System.out.println("Approved: " + approvedAssignmentsPerHit);
    // System.out.println("Rejected: " + rejectedAssignmentsPerHit);

    // distinct per-HIT counts only, not the full maps
    System.out.println("Approved (values): " + new HashSet<>(approvedAssignmentsPerHit.values()));
    System.out.println("Rejected (values): " + new HashSet<>(rejectedAssignmentsPerHit.values()));

    // rejection statistics
    int totalRejected = 0;
    for (Map.Entry<String, Integer> rejectionEntry : rejectedAssignmentsPerHit.entrySet()) {
        totalRejected += rejectionEntry.getValue();
    }

    System.out.println("Total rejections: " + totalRejected);

    /*
    // generate .success files for adding more annotations
    for (File resultFile : resultFiles) {
        String hitTypeID = mTurkOutputReader.getHitTypeIdForFile().get(resultFile);
    
        // assignments for that hittypeid (= file)
        Map<String, Integer> assignments = assignmentsPerHits.get(hitTypeID);
    
        prepareUpdateHITsFiles(assignments, hitTypeID, resultFile);
    }
    */

    int totalSavedPairs = 0;

    // load all previously prepared argument pairs
    for (File file : files) {
        List<ArgumentPair> argumentPairs = (List<ArgumentPair>) XStreamTools.getXStream().fromXML(file);

        List<AnnotatedArgumentPair> annotatedArgumentPairs = new ArrayList<>();

        for (ArgumentPair argumentPair : argumentPairs) {
            AnnotatedArgumentPair annotatedArgumentPair = new AnnotatedArgumentPair(argumentPair);

            // is there such an answer?
            String key = "Answer." + argumentPair.getId();

            // iterate only if there is such column to save time
            if (mTurkOutputReader.getColumnNames().contains(key)) {
                // now find the results
                for (Map<String, String> record : mTurkOutputReader) {
                    if (record.containsKey(key)) {
                        // extract the values
                        AnnotatedArgumentPair.MTurkAssignment assignment = new AnnotatedArgumentPair.MTurkAssignment();

                        boolean wasRejected = "Rejected".equals(record.get("assignmentstatus"));

                        // only non-rejected (if required)
                        if (!wasRejected) {
                            String hitID = record.get("hitid");
                            String workerID = record.get("workerid");
                            String assignmentId = record.get("assignmentid");

                            try {
                                assignment.setAssignmentAcceptTime(
                                        DATE_FORMAT.parse(record.get("assignmentaccepttime")));
                                assignment.setAssignmentSubmitTime(
                                        DATE_FORMAT.parse(record.get("assignmentsubmittime")));
                                assignment.setHitComment(record.get("Answer.feedback"));
                                assignment.setHitID(hitID);
                                assignment.setTurkID(workerID);
                                assignment.setAssignmentId(assignmentId);

                                // and answer specific fields
                                String valueRaw = record.get(key);

                                // so far the label has had format aXXX_aYYY_a1, aXXX_aYYY_a2, or aXXX_aYYY_equal
                                // strip now only true label
                                // NOTE(review): a value with fewer than three '_'-separated parts
                                // raises ArrayIndexOutOfBoundsException, which the catch below does
                                // not list - confirm whether that should abort the run.
                                String label = valueRaw.split("_")[2];

                                assignment.setValue(label);
                                String reason = record.get(key + "_reason");

                                // missing reason
                                if (reason == null) {
                                    // the assignment is only reported at the end, never attached to the pair
                                    assignmentsWithEmptyReason.add(assignmentId);
                                } else {
                                    assignment.setReason(reason);

                                    // get worker's stance
                                    String stanceRaw = record.get(key + "_stance");
                                    if (stanceRaw != null) {
                                        // parse stance
                                        String stance = stanceRaw.split("_stance_")[1];
                                        assignment.setWorkerStance(stance);
                                    }

                                    // we take maximal 5 assignments
                                    // (the stored assignments are sorted by accept time first; the new
                                    // one is appended only while fewer than MAXIMUM_ASSIGNMENTS_PER_HIT
                                    // are stored, so later records are dropped once the cap is reached)
                                    Collections.sort(annotatedArgumentPair.mTurkAssignments,
                                            new Comparator<AnnotatedArgumentPair.MTurkAssignment>() {
                                                @Override
                                                public int compare(AnnotatedArgumentPair.MTurkAssignment o1,
                                                        AnnotatedArgumentPair.MTurkAssignment o2) {
                                                    return o1.getAssignmentAcceptTime()
                                                            .compareTo(o2.getAssignmentAcceptTime());
                                                }
                                            });

                                    if (annotatedArgumentPair.mTurkAssignments
                                            .size() < MAXIMUM_ASSIGNMENTS_PER_HIT) {
                                        annotatedArgumentPair.mTurkAssignments.add(assignment);
                                    }
                                }
                            } catch (IllegalArgumentException | NullPointerException ex) {
                                // report the record and carry on with the next one
                                System.err.println("Malformed annotations for HIT " + hitID + ", worker "
                                        + workerID + ", assignment " + assignmentId + "; "
                                        + ex.getMessage() + ", full record: " + record);
                            }
                        }
                    }
                }
            }

            // and if there are some annotations, add it to the result set
            if (!annotatedArgumentPair.mTurkAssignments.isEmpty()) {
                annotatedArgumentPairs.add(annotatedArgumentPair);
            }
        }

        // input files that received no annotations produce no output file
        if (!annotatedArgumentPairs.isEmpty()) {
            File outputFile = new File(outputDir, file.getName());
            XStreamTools.toXML(annotatedArgumentPairs, outputFile);

            System.out.println("Saved " + annotatedArgumentPairs.size() + " annotated pairs to " + outputFile);
            totalSavedPairs += annotatedArgumentPairs.size();
        }
    }

    System.out.println("Total saved " + totalSavedPairs + " pairs");

    // print assignments with empty reasons
    // (tab-separated, with a header line: assignment id and a rejection comment)
    if (!assignmentsWithEmptyReason.isEmpty()) {
        System.out.println(
                "== Assignments with empty reason:\nassignmentIdToReject\tassignmentIdToRejectComment");
        for (String assignmentId : assignmentsWithEmptyReason) {
            System.out.println(
                    assignmentId + "\t\"Dear worker, you did not fill the required field with a reason.\"");
        }
    }
}
From source file:de.tudarmstadt.ukp.experiments.dip.wp1.documents.Step5LinguisticPreprocessing.java
public static void main(String[] args) throws Exception { // input dir - list of xml query containers // step4-boiler-plate/ File inputDir = new File(args[0]); // output dir File outputDir = new File(args[1]); if (!outputDir.exists()) { outputDir.mkdirs();// w w w. ja v a 2 s . c o m } // iterate over query containers for (File f : FileUtils.listFiles(inputDir, new String[] { "xml" }, false)) { QueryResultContainer queryResultContainer = QueryResultContainer .fromXML(FileUtils.readFileToString(f, "utf-8")); for (QueryResultContainer.SingleRankedResult rankedResults : queryResultContainer.rankedResults) { // System.out.println(rankedResults.plainText); if (rankedResults.plainText != null) { String[] lines = StringUtils.split(rankedResults.plainText, "\n"); // collecting all cleaned lines List<String> cleanLines = new ArrayList<>(lines.length); // collecting line tags List<String> lineTags = new ArrayList<>(lines.length); for (String line : lines) { // get the tag String tag = null; Matcher m = OPENING_TAG_PATTERN.matcher(line); if (m.find()) { tag = m.group(1); } if (tag == null) { throw new IllegalArgumentException("No html tag found for line:\n" + line); } // replace the tag at the beginning and the end String noTagText = line.replaceAll("^<\\S+>", "").replaceAll("</\\S+>$", ""); // do some html cleaning noTagText = noTagText.replaceAll(" ", " "); noTagText = noTagText.trim(); // add to the output if (!noTagText.isEmpty()) { cleanLines.add(noTagText); lineTags.add(tag); } } if (cleanLines.isEmpty()) { // the document is empty System.err.println("Document " + rankedResults.clueWebID + " in query " + queryResultContainer.qID + " is empty"); } else { // now join them back to paragraphs String text = StringUtils.join(cleanLines, "\n"); // create JCas JCas jCas = JCasFactory.createJCas(); jCas.setDocumentText(text); jCas.setDocumentLanguage("en"); // annotate WebParagraph SimplePipeline.runPipeline(jCas, 
AnalysisEngineFactory.createEngineDescription(WebParagraphAnnotator.class)); // fill the original tag information List<WebParagraph> webParagraphs = new ArrayList<>( JCasUtil.select(jCas, WebParagraph.class)); // they must be the same size as original ones if (webParagraphs.size() != lineTags.size()) { throw new IllegalStateException( "Different size of annotated paragraphs and original lines"); } for (int i = 0; i < webParagraphs.size(); i++) { WebParagraph p = webParagraphs.get(i); // get tag String tag = lineTags.get(i); p.setOriginalHtmlTag(tag); } SimplePipeline.runPipeline(jCas, AnalysisEngineFactory.createEngineDescription(StanfordSegmenter.class, // only on existing WebParagraph annotations StanfordSegmenter.PARAM_ZONE_TYPES, WebParagraph.class.getCanonicalName())); // now convert to XMI ByteArrayOutputStream byteOutputStream = new ByteArrayOutputStream(); XmiCasSerializer.serialize(jCas.getCas(), byteOutputStream); // encode to base64 String encoded = new BASE64Encoder().encode(byteOutputStream.toByteArray()); rankedResults.originalXmi = encoded; } } } // and save the query to output dir File outputFile = new File(outputDir, queryResultContainer.qID + ".xml"); FileUtils.writeStringToFile(outputFile, queryResultContainer.toXML(), "utf-8"); System.out.println("Finished " + outputFile); } }