Example usage for org.apache.hadoop.fs Path toUri

List of usage examples for org.apache.hadoop.fs Path toUri

Introduction

This page collects usage examples for the org.apache.hadoop.fs.Path method toUri().

Prototype

public URI toUri() 

Document

Convert this Path to a URI.
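A minimal standalone sketch of the call (the HDFS URI used here is a made-up example, not taken from the sources below):

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) {
        // build a Path and convert it back to a java.net.URI
        Path path = new Path("hdfs://namenode:8020/user/data/input.txt");
        URI uri = path.toUri();
        System.out.println(uri.getScheme()); // hdfs
        System.out.println(uri.getPath());   // /user/data/input.txt
    }
}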

Usage

From source file:com.digitalpebble.behemoth.uima.UIMAMapper.java

License:Apache License

public void configure(JobConf conf) {

    this.config = conf;

    storeshortnames = config.getBoolean("uima.store.short.names", true);

    File pearpath = new File(conf.get("uima.pear.path"));
    String pearname = pearpath.getName();

    URL urlPEAR = null;

    try {
        Path[] localArchives = DistributedCache.getLocalCacheFiles(conf);
        // identify the right archive
        for (Path la : localArchives) {
            String localPath = la.toUri().toString();
            LOG.info("Inspecting local path " + localPath);
            if (!localPath.endsWith(pearname))
                continue;
            urlPEAR = new URL("file://" + localPath);
            break;
        }
    } catch (IOException e) {
        throw new RuntimeException("Impossible to retrieve gate application from distributed cache", e);
    }

    if (urlPEAR == null)
        throw new RuntimeException("UIMA pear " + pearpath + " not available in distributed cache");

    File pearFile = new File(urlPEAR.getPath());

    // should check whether a different mapper has already unpacked it
    // but for now we just unpack in a different location for every mapper
    TaskAttemptID attempt = TaskAttemptID.forName(conf.get("mapred.task.id"));
    installDir = new File(pearFile.getParentFile(), attempt.toString());
    PackageBrowser instPear = PackageInstaller.installPackage(installDir, pearFile, true);

    // get the resources required for the AnalysisEngine
    org.apache.uima.resource.ResourceManager rsrcMgr = UIMAFramework.newDefaultResourceManager();

    // Create analysis engine from the installed PEAR package using
    // the created PEAR specifier
    XMLInputSource in;
    try {
        in = new XMLInputSource(instPear.getComponentPearDescPath());

        ResourceSpecifier specifier = UIMAFramework.getXMLParser().parseResourceSpecifier(in);

        tae = UIMAFramework.produceAnalysisEngine(specifier, rsrcMgr, null);

        cas = tae.newCAS();
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    String[] featuresFilters = this.config.get("uima.features.filter", "").split(",");
    // the feature filters have the form Type:featureName;
    // we group them by annotation type
    for (String ff : featuresFilters) {
        String[] fp = ff.split(":");
        if (fp.length != 2)
            continue;
        Set<Feature> features = featfilts.get(fp[0]);
        if (features == null) {
            features = new HashSet<Feature>();
            featfilts.put(fp[0], features);
        }
        Feature f = cas.getTypeSystem().getFeatureByFullName(ff);
        if (f != null)
            features.add(f);
    }

    String[] annotTypes = this.config.get("uima.annotations.filter", "").split(",");
    uimatypes = new ArrayList<Type>(annotTypes.length);

    for (String type : annotTypes) {
        Type aType = cas.getTypeSystem().getType(type);
        // skip names that are not known to the CAS type system
        if (aType != null)
            uimatypes.add(aType);
    }

}

From source file:com.digitalpebble.behemoth.util.ContentExtractor.java

License:Apache License

private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException {

    Path input = new Path(inputf);
    Path dirPath = new Path(outputf);

    FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf());

    if (!fsout.exists(dirPath))
        fsout.mkdirs(dirPath);
    else {
        System.err.println("Output " + outputf + " already exists");
        return -1;
    }

    // index file
    Path indexPath = new Path(dirPath, "index");
    if (!fsout.exists(indexPath)) {
        fsout.createNewFile(indexPath);
    }

    maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000);

    index = fsout.create(indexPath);

    createArchive(dirPath);

    FileSystem fs = input.getFileSystem(getConf());
    FileStatus[] statuses = fs.listStatus(input);
    int[] count = { 0 };
    for (FileStatus status : statuses) {
        Path suPath = status.getPath();
        if (suPath.getName().equals("_SUCCESS"))
            continue;
        generateDocs(suPath, dirPath, count);
    }

    if (index != null)
        index.close();

    if (currentArchive != null) {
        currentArchive.finish();
        currentArchive.close();
    }

    return 0;
}

From source file:com.digitalpebble.behemoth.util.ContentExtractor.java

License:Apache License

private void createArchive(Path dirPath) throws IOException, ArchiveException {
    FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf());
    String archiveType = "zip";
    partNum++;
    FSDataOutputStream currentArchiveOS = fsout
            .create(new Path(dirPath, "part_" + String.format("%06d", partNum) + "." + archiveType));
    currentArchive = new ArchiveStreamFactory().createArchiveOutputStream(archiveType, currentArchiveOS);
    numEntriesInCurrentArchive = 0;
}

From source file:com.ebay.erl.mobius.core.criterion.TupleRestrictions.java

License:Apache License

/**
 * Create a tuple criterion that accepts a tuple only when the value
 * of the <code>column</code> is present in the given <code>file</code>.
 * <p>
 *
 * The file is assumed to be a single-column text file with one or more
 * lines.  Each line is read into a case-insensitive set, which is then
 * used to check whether the value of the <code>column</code> is
 * contained in the set or not.
 *
 *
 * @param column the name of the column whose value is tested for
 * membership in the given <code>file</code>
 *
 * @param file a single-column, multi-line file containing strings/numbers;
 * each line is treated as a single entry.
 *
 * @return an instance of {@link TupleCriterion} that keeps only the records
 * whose <code>column</code> value is present in the given
 * <code>file</code>.
 *
 * @throws FileNotFoundException if the given file cannot be found.
 */
public static TupleCriterion within(final String column, File file) throws FileNotFoundException {
    final File f = TupleRestrictions.checkFileExist(file);

    return new TupleCriterion() {

        private static final long serialVersionUID = -1121221619118915652L;
        private Set<String> set;

        @Override
        public void setConf(Configuration conf) {
            try {
                if (conf.get("tmpfiles") == null || conf.get("tmpfiles").trim().length() == 0) {
                    conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf));
                } else {
                    conf.set("tmpfiles", validateFiles(f.getAbsolutePath(), conf) + "," + conf.get("tmpfiles"));
                }

            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        /**
         * COPIED FROM org.apache.hadoop.util.GenericOptionsParser
         */
        private String validateFiles(String files, Configuration conf) throws IOException {
            if (files == null)
                return null;
            String[] fileArr = files.split(",");
            String[] finalArr = new String[fileArr.length];
            for (int i = 0; i < fileArr.length; i++) {
                String tmp = fileArr[i];
                String finalPath;
                Path path = new Path(tmp);
                URI pathURI = path.toUri();
                FileSystem localFs = FileSystem.getLocal(conf);
                if (pathURI.getScheme() == null) {
                    // default to the local file system
                    // check if the file exists or not first
                    if (!localFs.exists(path)) {
                        throw new FileNotFoundException("File " + tmp + " does not exist.");
                    }
                    finalPath = path.makeQualified(localFs).toString();
                } else {
                    // check if the file exists in this file system
                    // we need to recreate this filesystem object to copy
                    // these files to the file system jobtracker is running
                    // on.
                    FileSystem fs = path.getFileSystem(conf);
                    if (!fs.exists(path)) {
                        throw new FileNotFoundException("File " + tmp + " does not exist.");
                    }
                    finalPath = path.makeQualified(fs).toString();
                    try {
                        fs.close();
                    } catch (IOException e) {
                        // ignore failures when closing the file system
                    }
                }
                finalArr[i] = finalPath;
            }
            return StringUtils.arrayToString(finalArr);
        }

        @Override
        protected boolean evaluate(Tuple tuple, Configuration configuration) {
            if (set == null) {
                set = new CaseInsensitiveTreeSet();
                BufferedReader br = null;
                try {
                    // the file was shipped to the task via "tmpfiles" (see
                    // setConf above), so at runtime it is available in the
                    // working directory under its plain name
                    br = new BufferedReader(new FileReader(new File(f.getName())));
                    String newLine = null;
                    while ((newLine = br.readLine()) != null) {
                        this.set.add(newLine);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                } finally {
                    if (br != null) {
                        try {
                            br.close();
                        } catch (IOException e) {
                            // ignore failures when closing the reader
                        }
                    }
                }
            }

            String value = tuple.getString(column);
            if (value != null) {
                return this.set.contains(value);
            } else {
                return false;
            }
        }

        @Override
        public String[] getInvolvedColumns() {
            return new String[] { column };
        }
    };
}
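A hypothetical usage sketch (the column name, whitelist path, and surrounding calls are assumptions for illustration, not part of the Mobius source above):

// keep only the tuples whose "country" value appears in the
// whitelist file (one entry per line, matched case-insensitively);
// within(...) throws FileNotFoundException if the file is missing
File whitelist = new File("/tmp/allowed_countries.txt");
TupleCriterion criterion = TupleRestrictions.within("country", whitelist);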

From source file:com.ebay.erl.mobius.core.fs.MobiusLocalFileSystem.java

License:Apache License

public boolean mkdirs(Path f, FsPermission permission) throws IOException {
    URI uri = f.toUri();

    File file = new File(uri.getPath());

    if (!file.exists()) {
        boolean b = file.mkdirs();
        if (!b)
            throw new IOException(file.getAbsolutePath());
        return b;
    } else {
        return true;
    }
}

From source file:com.ebay.erl.mobius.core.mapred.FileInputFormatHelper.java

License:Apache License

@Override
public URI getUniquePathByInputFormat(JobConf conf, Path anInput) throws IOException {
    // since it's FileInputFormat, the ID can be represented just 
    // using the input path

    Path result = this.getFileSystem(conf).makeQualified(anInput);

    if (!this.getFileSystem(conf).isFile(anInput) && !result.toUri().getPath().endsWith("/")) {
        // the given input is a folder but its path string doesn't
        // end with a slash; add one so that it can be distinguished
        // by its string representation alone.

        result = new Path(result.toString() + "/");
    }

    return this.getFileSystem(conf).makeQualified(result).toUri();
}

From source file:com.ebay.erl.mobius.core.MobiusJob.java

License:Apache License

/**
 * Test if the given <code>input</code> is the output of another job or not.
 *
 * @param input input path of a job.
 * @return <code>true</code> if the <code>input</code> is the output
 * path of another job, <code>false</code> otherwise.
 */
public boolean isOutputOfAnotherJob(Path input) {
    // normalize the input first, in case it doesn't
    // contain a scheme (e.g. hdfs:// or file://).
    Path p = this.getFS().makeQualified(input);
    LOGGER.info("Current Path Key:" + this.jobTopology.keySet());
    LOGGER.info(p.toUri() + " is the output of another job? " + this.jobTopology.containsKey(p.toUri()));

    return this.jobTopology.containsKey(p.toUri());
}
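A hypothetical call sketch (the path and the surrounding job-chaining decision are assumptions for illustration):

// inside a MobiusJob subclass: if an input is produced by a job we
// already know about, wire a dependency instead of reading it as a
// static input
Path input = new Path("hdfs://namenode:8020/data/stage1/out");
if (isOutputOfAnotherJob(input)) {
    // add the producing job as a dependency of the consuming job
}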

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

public static int run(Configuration conf, Path inputPath, Path topicModelOutputPath, int numTopics,
        int numTerms, double alpha, double eta, int maxIterations, int iterationBlockSize,
        double convergenceDelta, Path dictionaryPath, Path docTopicOutputPath, Path topicModelStateTempPath,
        long randomSeed, float testFraction, int numTrainThreads, int numUpdateThreads, int maxItersPerDoc,
        int numReduceTasks, boolean backfillPerplexity)
        throws ClassNotFoundException, IOException, InterruptedException {
    // verify arguments
    Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0,
            "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction);
    Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0,
            "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction);

    String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) "
            + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, "
            + "topic/term prior {}.  Maximum iterations to run will be {}, unless the change in "
            + "perplexity is less than {}.  Topic model output (p(term|topic) for each topic) will be "
            + "stored {}.  Random initialization seed is {}, holding out {} of the data for perplexity "
            + "check\n";
    log.info(infoString, new Object[] { inputPath, numTerms, numTopics, alpha, eta, maxIterations,
            convergenceDelta, topicModelOutputPath, randomSeed, testFraction });
    infoString = dictionaryPath == null ? ""
            : "Dictionary to be used located " + dictionaryPath.toString() + '\n';
    infoString += docTopicOutputPath == null ? ""
            : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n';
    log.info(infoString);

    FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf);
    int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations);
    log.info("Current iteration number: {}", iterationNumber);

    conf.set(NUM_TOPICS, String.valueOf(numTopics));
    conf.set(NUM_TERMS, String.valueOf(numTerms));
    conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha));
    conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta));
    conf.set(RANDOM_SEED, String.valueOf(randomSeed));
    conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads));
    conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads));
    conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc));
    conf.set(MODEL_WEIGHT, "1"); // TODO
    conf.set(TEST_SET_FRACTION, String.valueOf(testFraction));

    List<Double> perplexities = Lists.newArrayList();
    for (int i = 1; i <= iterationNumber; i++) {
        // form path to model
        Path modelPath = modelPath(topicModelStateTempPath, i);

        // read perplexity
        double perplexity = readPerplexity(conf, topicModelStateTempPath, i);
        if (Double.isNaN(perplexity)) {
            if (!(backfillPerplexity && i % iterationBlockSize == 0)) {
                continue;
            }
            log.info("Backfilling perplexity at iteration {}", i);
            if (!fs.exists(modelPath)) {
                log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation",
                        modelPath.toString(), i);
                continue;
            }
            perplexity = calculatePerplexity(conf, inputPath, modelPath, i);
        }

        // register and log perplexity
        perplexities.add(perplexity);
        log.info("Perplexity at iteration {} = {}", i, perplexity);
    }

    long startTime = System.currentTimeMillis();
    while (iterationNumber < maxIterations) {
        // test convergence
        if (convergenceDelta > 0.0) {
            double delta = rateOfChange(perplexities);
            if (delta < convergenceDelta) {
                log.info("Convergence achieved at iteration {} with perplexity {} and delta {}",
                        new Object[] { iterationNumber, perplexities.get(perplexities.size() - 1), delta });
                break;
            }
        }

        // update model
        iterationNumber++;
        log.info("About to run iteration {} of {}", iterationNumber, maxIterations);
        Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1);
        Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber);
        runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations,
                numReduceTasks);

        // calculate perplexity
        if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) {
            perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber));
            log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1));
            log.info("(p_{} - p_{}) / p_0 = {}; target = {}", new Object[] { iterationNumber,
                    iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta });
        }
    }
    log.info("Completed {} iterations in {} seconds", iterationNumber,
            (System.currentTimeMillis() - startTime) / 1000);
    log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities));

    // write final topic-term and doc-topic distributions
    Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber);
    Job topicModelOutputJob = topicModelOutputPath != null
            ? writeTopicModel(conf, finalIterationData, topicModelOutputPath)
            : null;
    Job docInferenceJob = docTopicOutputPath != null
            ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath)
            : null;
    if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) {
        return -1;
    }
    if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) {
        return -1;
    }
    return 0;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

/**
 * @param topicModelStateTemp
 * @param iteration
 * @return the perplexity of the documents sampled during the perplexity
 *         computation, normalized by their model weight, or
 *         {@code Double.NaN} if no perplexity data exists for the
 *         given iteration.
 * @throws IOException
 */
public static double readPerplexity(Configuration conf, Path topicModelStateTemp, int iteration)
        throws IOException {
    Path perplexityPath = perplexityPath(topicModelStateTemp, iteration);
    FileSystem fs = FileSystem.get(perplexityPath.toUri(), conf);
    if (!fs.exists(perplexityPath)) {
        log.warn("Perplexity path {} does not exist, returning NaN", perplexityPath);
        return Double.NaN;
    }
    double perplexity = 0;
    double modelWeight = 0;
    long n = 0;
    for (Pair<DoubleWritable, DoubleWritable> pair : new SequenceFileDirIterable<DoubleWritable, DoubleWritable>(
            perplexityPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
        modelWeight += pair.getFirst().get();
        perplexity += pair.getSecond().get();
        n++;
    }
    log.info("Read {} entries with total perplexity {} and model weight {}",
            new Object[] { n, perplexity, modelWeight });
    return perplexity / modelWeight;
}
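For example, if the part files under the perplexity path contain the (model weight, perplexity) pairs (2.0, 10.0) and (3.0, 20.0), the method returns (10.0 + 20.0) / (2.0 + 3.0) = 6.0.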

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static Job writeDocTopicInference(Configuration conf, Path corpus, Path modelInput, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {
    String jobName = String.format("Writing final document/topic inference from %s to %s", corpus, output);
    log.info("About to run: " + jobName);
    Job job = new Job(conf, jobName);
    job.setMapperClass(CVB0DocInferenceMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileSystem fs = FileSystem.get(corpus.toUri(), conf);
    if (modelInput != null && fs.exists(modelInput)) {
        FileStatus[] statuses = fs.listStatus(modelInput, PathFilters.partFilter());
        URI[] modelUris = new URI[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelUris[i] = statuses[i].getPath().toUri();
        }
        DistributedCache.setCacheFiles(modelUris, conf);
    }
    setModelPaths(job, modelInput);//bug:mahout-1147
    FileInputFormat.addInputPath(job, corpus);
    FileOutputFormat.setOutputPath(job, output);
    job.setJarByClass(CVB0Driver.class);
    job.submit();
    return job;
}