List of usage examples for org.apache.hadoop.fs Path toUri
public URI toUri()
From source file:com.digitalpebble.behemoth.uima.UIMAMapper.java
License:Apache License
public void configure(JobConf conf) { this.config = conf; storeshortnames = config.getBoolean("uima.store.short.names", true); File pearpath = new File(conf.get("uima.pear.path")); String pearname = pearpath.getName(); URL urlPEAR = null;/* www. j av a 2s . c o m*/ try { Path[] localArchives = DistributedCache.getLocalCacheFiles(conf); // identify the right archive for (Path la : localArchives) { String localPath = la.toUri().toString(); LOG.info("Inspecting local paths " + localPath); if (!localPath.endsWith(pearname)) continue; urlPEAR = new URL("file://" + localPath); break; } } catch (IOException e) { throw new RuntimeException("Impossible to retrieve gate application from distributed cache", e); } if (urlPEAR == null) throw new RuntimeException("UIMA pear " + pearpath + " not available in distributed cache"); File pearFile = new File(urlPEAR.getPath()); // should check whether a different mapper has already unpacked it // but for now we just unpack in a different location for every mapper TaskAttemptID attempt = TaskAttemptID.forName(conf.get("mapred.task.id")); installDir = new File(pearFile.getParentFile(), attempt.toString()); PackageBrowser instPear = PackageInstaller.installPackage(installDir, pearFile, true); // get the resources required for the AnalysisEngine org.apache.uima.resource.ResourceManager rsrcMgr = UIMAFramework.newDefaultResourceManager(); // Create analysis engine from the installed PEAR package using // the created PEAR specifier XMLInputSource in; try { in = new XMLInputSource(instPear.getComponentPearDescPath()); ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in); tae = UIMAFramework.produceAnalysisEngine(specifier, rsrcMgr, null); cas = tae.newCAS(); } catch (Exception e) { throw new RuntimeException(e); } String[] featuresFilters = this.config.get("uima.features.filter", "").split(","); // the featurefilters have the following form : Type:featureName // we group them by annotation type for (String ff 
: featuresFilters) { String[] fp = ff.split(":"); if (fp.length != 2) continue; Set<Feature> features = featfilts.get(fp[0]); if (features == null) { features = new HashSet<Feature>(); featfilts.put(fp[0], features); } Feature f = cas.getTypeSystem().getFeatureByFullName(ff); if (f != null) features.add(f); } String[] annotTypes = this.config.get("uima.annotations.filter", "").split(","); uimatypes = new ArrayList<Type>(annotTypes.length); for (String type : annotTypes) { Type aType = cas.getTypeSystem().getType(type); uimatypes.add(aType); } }
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException { Path input = new Path(inputf); Path dirPath = new Path(outputf); FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf()); if (fsout.exists(dirPath) == false) fsout.mkdirs(dirPath);/* w w w . ja v a2 s . c o m*/ else { System.err.println("Output " + outputf + " already exists"); return -1; } // index file Path indexPath = new Path(dirPath, "index"); if (fsout.exists(indexPath) == false) { fsout.createNewFile(indexPath); } maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000); index = fsout.create(indexPath); createArchive(dirPath); FileSystem fs = input.getFileSystem(getConf()); FileStatus[] statuses = fs.listStatus(input); int count[] = { 0 }; for (int i = 0; i < statuses.length; i++) { FileStatus status = statuses[i]; Path suPath = status.getPath(); if (suPath.getName().equals("_SUCCESS")) continue; generateDocs(suPath, dirPath, count); } if (index != null) index.close(); if (currentArchive != null) { currentArchive.finish(); currentArchive.close(); } return 0; }
From source file:com.digitalpebble.behemoth.util.ContentExtractor.java
License:Apache License
/**
 * Starts the next zip archive in {@code dirPath}.
 *
 * Increments the part counter, creates a file named
 * {@code part_NNNNNN.zip} (counter zero-padded to six digits) in the given
 * directory, makes it the current archive and resets the per-archive entry
 * count to zero.
 *
 * @param dirPath directory in which the archive file is created
 * @throws IOException      if the output file cannot be created
 * @throws ArchiveException if the archive stream cannot be created
 */
private void createArchive(Path dirPath) throws IOException, ArchiveException {
    // resolve the file system first so a failure here leaves partNum untouched
    FileSystem outputFs = FileSystem.get(dirPath.toUri(), getConf());

    final String archiveType = "zip";
    partNum++;

    String partFileName = String.format("part_%06d.%s", partNum, archiveType);
    FSDataOutputStream archiveStream = outputFs.create(new Path(dirPath, partFileName));

    ArchiveStreamFactory factory = new ArchiveStreamFactory();
    currentArchive = factory.createArchiveOutputStream(archiveType, archiveStream);
    numEntriesInCurrentArchive = 0;
}
From source file:com.ebay.erl.mobius.core.criterion.TupleRestrictions.java
License:Apache License
/** * Create a tuple criterion that only accepts tuples when the value * of the <code>column</code> are presented in the given <code>file</code> * <p>/*from ww w. j a va2s. com*/ * * The assumption of the file is that, it's single column and one to many * line text file. Each line is read into a case insensitive set, and * using the set to check the value of the <code>column</code> within * the set or not. * * * @param column the name of a column to be tested that whether its value is in * the given <code>file</code> or not * * @param file a single column and multiple lines of file that contains strings/numbers, * each line is treated as a single unit. * * @return an instance of {@link TupleCriterion} that extracts only the records * when the value of its <code>column</code> are presented in the given * <code>file</code>. * * @throws FileNotFoundException if the given file cannot be found. */ public static TupleCriterion within(final String column, File file) throws FileNotFoundException { final File f = TupleRestrictions.checkFileExist(file); return new TupleCriterion() { private static final long serialVersionUID = -1121221619118915652L; private Set<String> set; @Override public void setConf(Configuration conf) { try { if (conf.get("tmpfiles") == null || conf.get("tmpfiles").trim().length() == 0) { conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf)); } else { conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf) + "," + conf.get("tmpfiles")); } } catch (IOException e) { throw new IllegalArgumentException(e); } } /** * COPIED FROM org.apache.hadoop.util.GenericOptionsParser */ private String validateFiles(String files, Configuration conf) throws IOException { if (files == null) return null; String[] fileArr = files.split(","); String[] finalArr = new String[fileArr.length]; for (int i = 0; i < fileArr.length; i++) { String tmp = fileArr[i]; String finalPath; Path path = new Path(tmp); URI pathURI = path.toUri(); FileSystem localFs = 
FileSystem.getLocal(conf); if (pathURI.getScheme() == null) { // default to the local file system // check if the file exists or not first if (!localFs.exists(path)) { throw new FileNotFoundException("File " + tmp + " does not exist."); } finalPath = path.makeQualified(localFs).toString(); } else { // check if the file exists in this file system // we need to recreate this filesystem object to copy // these files to the file system jobtracker is running // on. FileSystem fs = path.getFileSystem(conf); if (!fs.exists(path)) { throw new FileNotFoundException("File " + tmp + " does not exist."); } finalPath = path.makeQualified(fs).toString(); try { fs.close(); } catch (IOException e) { } ; } finalArr[i] = finalPath; } return StringUtils.arrayToString(finalArr); } @Override protected boolean evaluate(Tuple tuple, Configuration configuration) { if (set == null) { set = new CaseInsensitiveTreeSet(); BufferedReader br = null; try { br = new BufferedReader(new FileReader(new File(f.getName()))); String newLine = null; while ((newLine = br.readLine()) != null) { this.set.add(newLine); } } catch (IOException e) { throw new RuntimeException(e); } finally { try { br.close(); } catch (Throwable e) { } } } String value = tuple.getString(column); if (value != null) { return this.set.contains(value); } else { return false; } } @Override public String[] getInvolvedColumns() { return new String[] { column }; } }; }
From source file:com.ebay.erl.mobius.core.fs.MobiusLocalFileSystem.java
License:Apache License
public boolean mkdirs(Path f, FsPermission permission) throws IOException { URI uri = f.toUri(); File file = new File(uri.getPath()); if (!file.exists()) { boolean b = file.mkdirs(); if (!b)//from w w w .j a v a 2 s.c om throw new IOException(file.getAbsolutePath()); return b; } else { return true; } }
From source file:com.ebay.erl.mobius.core.mapred.FileInputFormatHelper.java
License:Apache License
@Override public URI getUniquePathByInputFormat(JobConf conf, Path anInput) throws IOException { // since it's FileInputFormat, the ID can be represented just // using the input path Path result = this.getFileSystem(conf).makeQualified(anInput); if (!this.getFileSystem(conf).isFile(anInput) && result.toUri().getPath().endsWith("/")) { // the given input is a folder but it's path string doesn't // end with slash, then add it can be distinguished by // just it's string representation. result = new Path(result.toString() + "/"); }/*from w w w . j a va 2 s. c o m*/ return this.getFileSystem(conf).makeQualified(result).toUri(); }
From source file:com.ebay.erl.mobius.core.MobiusJob.java
License:Apache License
/** * Test if the given <code>input</code> is the output of another job or not * /* ww w.j ava 2 s .c o m*/ * @param input input path of a job. * @return <code>true</code> if the <code>input</code> is the output * path of another job, <code>false</code> otherwise. */ public boolean isOutputOfAnotherJob(Path input) { // normalize the input first, in case of it doesn't // contain schema (hdfs://, or file:// for example.) Path p = this.getFS().makeQualified(input); LOGGER.info("Current Path Key:" + this.jobTopology.keySet()); LOGGER.info(p.toUri() + " is the output of another job? " + this.jobTopology.containsKey(p.toUri())); return this.jobTopology.containsKey(p.toUri()); }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
public static int run(Configuration conf, Path inputPath, Path topicModelOutputPath, int numTopics, int numTerms, double alpha, double eta, int maxIterations, int iterationBlockSize, double convergenceDelta, Path dictionaryPath, Path docTopicOutputPath, Path topicModelStateTempPath, long randomSeed, float testFraction, int numTrainThreads, int numUpdateThreads, int maxItersPerDoc, int numReduceTasks, boolean backfillPerplexity) throws ClassNotFoundException, IOException, InterruptedException { // verify arguments Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0, "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction); Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0, "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction); String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) " + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, " + "topic/term prior {}. Maximum iterations to run will be {}, unless the change in " + "perplexity is less than {}. Topic model output (p(term|topic) for each topic) will be " + "stored {}. Random initialization seed is {}, holding out {} of the data for perplexity " + "check\n"; log.info(infoString, new Object[] { inputPath, numTerms, numTopics, alpha, eta, maxIterations, convergenceDelta, topicModelOutputPath, randomSeed, testFraction }); infoString = dictionaryPath == null ? "" : "Dictionary to be used located " + dictionaryPath.toString() + '\n'; infoString += docTopicOutputPath == null ? "" : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n'; log.info(infoString);/*from www . j av a 2 s . 
co m*/ FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); conf.set(NUM_TOPICS, String.valueOf(numTopics)); conf.set(NUM_TERMS, String.valueOf(numTerms)); conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha)); conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta)); conf.set(RANDOM_SEED, String.valueOf(randomSeed)); conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads)); conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads)); conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc)); conf.set(MODEL_WEIGHT, "1"); // TODO conf.set(TEST_SET_FRACTION, String.valueOf(testFraction)); List<Double> perplexities = Lists.newArrayList(); for (int i = 1; i <= iterationNumber; i++) { // form path to model Path modelPath = modelPath(topicModelStateTempPath, i); // read perplexity double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { continue; } log.info("Backfilling perplexity at iteration {}", i); if (!fs.exists(modelPath)) { log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation", modelPath.toString(), i); continue; } perplexity = calculatePerplexity(conf, inputPath, modelPath, i); } // register and log perplexity perplexities.add(perplexity); log.info("Perplexity at iteration {} = {}", i, perplexity); } long startTime = System.currentTimeMillis(); while (iterationNumber < maxIterations) { // test convergence if (convergenceDelta > 0.0) { double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", new Object[] { iterationNumber, perplexities.get(perplexities.size() - 1), delta }); break; } } // update model iterationNumber++; log.info("About to run iteration 
{} of {}", iterationNumber, maxIterations); Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); // calculate perplexity if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) { perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", new Object[] { iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta }); } } log.info("Completed {} iterations in {} seconds", iterationNumber, (System.currentTimeMillis() - startTime) / 1000); log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities)); // write final topic-term and doc-topic distributions Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null ? writeTopicModel(conf, finalIterationData, topicModelOutputPath) : null; Job docInferenceJob = docTopicOutputPath != null ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath) : null; if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) { return -1; } if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) { return -1; } return 0; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
/** * @param topicModelStateTemp/*www.j av a 2 s. c om*/ * @param iteration * @return {@code double[2]} where first value is perplexity and second is model weight of those * documents sampled during perplexity computation, or {@code null} if no perplexity data * exists for the given iteration. * @throws IOException */ public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration) throws IOException { Path perplexityPath = perplexityPath(topicModelStateTemp, iteration); FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf); if (!fs.exists(perplexityPath)) { log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath); return Double.NaN; } double perplexity = 0; double modelWeight = 0; long n = 0; for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>( perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) { modelWeight += pair.getFirst().get(); perplexity += pair.getSecond().get(); n++; } log.info("Read {} entries with total perplexity {} and model weight {}", new Object[] { n, perplexity, modelWeight }); return perplexity / modelWeight; }
From source file:com.elex.dmp.lda.CVB0Driver.java
License:Apache License
private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output) throws IOException, ClassNotFoundException, InterruptedException { String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output); log.info("About to run: " + jobName); Job job = new Job(conf, jobName); job.setMapperClass(CVB0DocInferenceMapper.class); job.setNumReduceTasks(0);//ww w .j a v a 2 s . c o m job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorWritable.class); FileSystem fs = FileSystem.get(corpus.toUri(), conf); if (modelInput != null && fs.exists(modelInput)) { FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter()); URI[] modelUris = new URI[statuses.length]; for (int i = 0; i < statuses.length; i++) { modelUris[i] = statuses[i].getPath().toUri(); } DistributedCache.setCacheFiles(modelUris, conf); } setModelPaths(job, modelInput);//bug:mahout-1147 FileInputFormat.addInputPath(job, corpus); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(CVB0Driver.class); job.submit(); return job; }