Example usage for org.apache.hadoop.fs Path toString

List of usage examples for org.apache.hadoop.fs Path toString

Introduction

On this page you can find example usages of org.apache.hadoop.fs Path toString.

Prototype

@Override
public String toString()
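
As a quick orientation before the usage list, here is a minimal sketch of what Path.toString() returns; the paths, scheme, and class name are made up for illustration.

import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) {
        // A fully qualified path keeps its scheme and authority in the string form.
        Path qualified = new Path("hdfs://namenode:8020/user/data");
        System.out.println(qualified.toString()); // hdfs://namenode:8020/user/data

        // A relative path is returned as given, after normalization.
        Path relative = new Path("logs/2020/01");
        System.out.println(relative.toString()); // logs/2020/01
    }
}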

Source Link

Usage

From source file:com.ebay.erl.mobius.core.builder.AbstractDatasetBuilder.java

License:Apache License

/**
 * Add the <code>paths</code> to the underlying dataset.  The boolean
 * flag <code>validatePathExistance</code> specifies whether Mobius
 * needs to verify that the specified <code>paths</code> exist.
 * <p>
 * 
 * If <code>validatePathExistance</code> is true and one of the
 * <code>paths</code> doesn't exist, an <code>IOException</code> will
 * be thrown.
 * <p>
 * 
 * If a path exists and it's a folder, {@link #checkTouchFile(FileSystem, Path)} 
 * will be called to see whether a touch file exists under that folder.
 * The default implementation of <code>checkTouchFile</code> always returns
 * true, which means the dataset builder doesn't check for a touch file by default.
 * If there is a need to check for a touch file, the subclass should override that
 * method, and when the method returns false, an <code>IOException</code>
 * will be thrown here for that specific path.
 */
protected ACTUAL_BUILDER_IMPL addInputPath(boolean validatePathExistance, Path... paths) throws IOException {
    if (paths == null || paths.length == 0) {
        throw new IllegalArgumentException("Please specify at least one path");
    }

    FileSystem fs = FileSystem.get(this.mobiusJob.getConf());

    for (Path aPath : paths) {
        FileStatus[] fileStatus = null;

        try {
            fileStatus = fs.globStatus(aPath);
        } catch (NullPointerException e) {
            LOGGER.warn("FileSystem list globStatus thrown NPE", e);
        }

        if (fileStatus == null) {
            if (validatePathExistance) {
                throw new FileNotFoundException(aPath.toString() + " doesn't exist on file system.");
            } else {
                // no need to validate, as the input
                // for this dataset is coming from
                // the output of another dataset.
                this.getDataset().addInputs(aPath);
            }
        } else {
            // file(s) exists, add inputs
            for (FileStatus aFileStatus : fileStatus) {
                Path p = aFileStatus.getPath();
                if (!fs.isFile(p)) {
                    if (!this.checkTouchFile(fs, p)) {
                        throw new IllegalStateException(
                                "No touch file under " + p.toString() + ", this dataset is not ready.");
                    } else {
                        this.getDataset().addInputs(p);
                    }
                } else {
                    this.getDataset().addInputs(p);
                }
            }
        }
    }

    return (ACTUAL_BUILDER_IMPL) this;
}
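
The same glob-then-validate pattern can be sketched on its own. This is not the Mobius API, just a standalone illustration; the class name and the glob used in main() are hypothetical, and the default FileSystem is assumed.

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class InputPathValidator {

    // Throws FileNotFoundException when a glob matches nothing, using
    // Path.toString() to name the offending path in the message.
    public static void validate(Configuration conf, Path... paths) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        for (Path aPath : paths) {
            FileStatus[] matches = fs.globStatus(aPath);
            if (matches == null || matches.length == 0) {
                throw new FileNotFoundException(aPath.toString() + " doesn't exist on the file system.");
            }
        }
    }

    public static void main(String[] args) throws IOException {
        validate(new Configuration(), new Path("/tmp/input/*.txt")); // hypothetical glob
    }
}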

From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java

License:Apache License

/**
 * Set the path to the SequenceFile storing the sorted partition keyset.
 * It must be the case that for <tt>R</tt> reduces, there are <tt>R-1</tt>
 * keys in the SequenceFile.
 */
public static void setPartitionFile(JobConf job, Path p) {
    job.set("total.order.partitioner.path", p.toString());
}
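
A minimal round trip of the idea above: Path.toString() is stored in a JobConf and an equivalent Path is rebuilt from the stored string. The partition-file location and the standalone class are made up; only the property name mirrors the snippet above.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class PartitionFileConfig {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        Path partitionFile = new Path("/tmp/_partitions.seq"); // hypothetical location

        // A Path can't be stored in the configuration directly,
        // so its string form is stored instead.
        job.set("total.order.partitioner.path", partitionFile.toString());

        // Reading it back reconstructs an equivalent Path.
        Path restored = new Path(job.get("total.order.partitioner.path"));
        System.out.println(restored.toString()); // /tmp/_partitions.seq
    }
}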

From source file:com.ebay.erl.mobius.core.JobSetup.java

License:Apache License

private static void ensureOutputDelete(Path outputFolder, Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    outputFolder = fs.makeQualified(outputFolder);
    if (fs.exists(outputFolder)) {
        LOGGER.info("Deleting " + outputFolder.toString());
        fs.delete(outputFolder, true);
    }
}
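
The delete-if-exists pattern is reusable on its own. A small sketch, assuming the default FileSystem from the Configuration; the output folder and class name are hypothetical.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OutputCleaner {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Qualify the path so the logged string carries the scheme and authority.
        Path output = fs.makeQualified(new Path("/tmp/job-output")); // hypothetical folder
        if (fs.exists(output)) {
            System.out.println("Deleting " + output.toString());
            fs.delete(output, true); // recursive delete
        }
    }
}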

From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java

License:Apache License

private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////
        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(
                    new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////

        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = (float) samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // Try to distribute the load evenly across reducers by
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries (objects selected from the
            // <code>samples</code> array); each block should have
            // about the same size.

            // Find the last index of an element equal to samples[i], as
            // such an element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i;//Util.findLowerBound(samples, samples[i], comparator);

            // the number of repetitions of samples[i]; if the key itself
            // covers too many samples, select it as a boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than two times of average reducer size
            {
                // the current element is too big (greater than
                // two times the <code>avgReduceSize</code>),
                // so use it as a boundary itself
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                //pw.println(samples[i]);

                // immediately add the next element as a boundary as well;
                // the next element starts at <code>upperBound + 1</code>,
                // which prevents the current one from consuming even
                // more of the key space.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    //pw.println(samples[upperBound+1]);

                    // move on to the element after the last occurrence of <code>samples[upperBound + 1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // the current element is small enough to be considered
                // together with the previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current element, we have
                    // found a block that's big enough, so select the
                    // element as a boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    //pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of written samples doesn't equal the number of
        // reducers minus one, then the key space is too small and
        // TotalOrderPartitioner won't work, since it requires the
        // partition boundaries to be distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the written samples define boundaries; e.g., if
            // the sample size is two with elements [300, 1000], then
            // there should be 3 reducers: one handling i < 300, one
            // handling 300 <= i < 1000, and another one handling 1000 <= i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
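
The collision-avoidance idiom in the middle of the method (appending a timestamp to Path.toString() until the name is unused) also works in isolation. A sketch with a hypothetical base path and the default FileSystem:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UniquePathExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Start from a hypothetical base name and keep suffixing a
        // timestamp until no file with that name exists.
        Path candidate = fs.makeQualified(new Path("/tmp/_partitions.seq"));
        while (fs.exists(candidate)) {
            candidate = new Path(candidate.toString() + "." + System.currentTimeMillis());
        }
        System.out.println("Using partition file: " + candidate.toString());
    }
}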

From source file:com.ebay.erl.mobius.core.mapred.FileInputFormatHelper.java

License:Apache License

@Override
public URI getUniquePathByInputFormat(JobConf conf, Path anInput) throws IOException {
    // since it's FileInputFormat, the ID can be represented just 
    // using the input path

    Path result = this.getFileSystem(conf).makeQualified(anInput);

    if (!this.getFileSystem(conf).isFile(anInput) && !result.toUri().getPath().endsWith("/")) {
        // the given input is a folder but its path string doesn't
        // end with a slash; append one so the folder can be
        // distinguished just by its string representation.

        result = new Path(result.toString() + "/");
    }

    return this.getFileSystem(conf).makeQualified(result).toUri();
}

From source file:com.ebay.erl.mobius.core.MobiusJob.java

License:Apache License

void deleteTempFiles() throws IOException {
    LOGGER.info("Cleanning temporal files...");

    for (Path aTempFile : this.tempFiles) {
        if (!this.getFS().delete(aTempFile, true)) {
            LOGGER.warn("Cannot delete temp file:" + aTempFile.toString());
        } else {
            LOGGER.info(aTempFile.toString() + " deleted.");
        }
    }
    LOGGER.info("All temporal files are deleted.");
}

From source file:com.ebay.erl.mobius.core.MobiusJob.java

License:Apache License

/**
 * create an empty folder under hadoop.tmp.dir.
 */
public Path newTempPath() throws IOException {
    Path tmp = new Path(this.getConf().get("hadoop.tmp.dir"), String.valueOf(System.currentTimeMillis()));
    while (this.getFS().exists(tmp)) {
        tmp = new Path(this.getConf().get("hadoop.tmp.dir"), String.valueOf(System.currentTimeMillis()));
    }

    if (!this.getFS().mkdirs(tmp)) {
        throw new IOException("Cannot create temp file:" + tmp.toString() + ".");
    }

    // remember the temp file so it can be deleted after
    // this job has completed.
    this.tempFiles.add(tmp);

    return tmp;
}
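
The naming scheme used here (a millisecond timestamp under hadoop.tmp.dir) can be shown without the builder. A small sketch; the class name is made up, and note that hadoop.tmp.dir has a default value, so the lookup resolves even on an unconfigured client.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class TempPathNaming {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // hadoop.tmp.dir defaults to /tmp/hadoop-${user.name},
        // so this resolves on a vanilla Configuration.
        Path tmp = new Path(conf.get("hadoop.tmp.dir"), String.valueOf(System.currentTimeMillis()));
        System.out.println("Temp path: " + tmp.toString());
    }
}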

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

public static int run(Configuration conf, Path inputPath, Path topicModelOutputPath, int numTopics,
        int numTerms, double alpha, double eta, int maxIterations, int iterationBlockSize,
        double convergenceDelta, Path dictionaryPath, Path docTopicOutputPath, Path topicModelStateTempPath,
        long randomSeed, float testFraction, int numTrainThreads, int numUpdateThreads, int maxItersPerDoc,
        int numReduceTasks, boolean backfillPerplexity)
        throws ClassNotFoundException, IOException, InterruptedException {
    // verify arguments
    Preconditions.checkArgument(testFraction >= 0.0 && testFraction <= 1.0,
            "Expected 'testFraction' value in range [0, 1] but found value '%s'", testFraction);
    Preconditions.checkArgument(!backfillPerplexity || testFraction > 0.0,
            "Expected 'testFraction' value in range (0, 1] but found value '%s'", testFraction);

    String infoString = "Will run Collapsed Variational Bayes (0th-derivative approximation) "
            + "learning for LDA on {} (numTerms: {}), finding {}-topics, with document/topic prior {}, "
            + "topic/term prior {}.  Maximum iterations to run will be {}, unless the change in "
            + "perplexity is less than {}.  Topic model output (p(term|topic) for each topic) will be "
            + "stored {}.  Random initialization seed is {}, holding out {} of the data for perplexity "
            + "check\n";
    log.info(infoString, new Object[] { inputPath, numTerms, numTopics, alpha, eta, maxIterations,
            convergenceDelta, topicModelOutputPath, randomSeed, testFraction });
    infoString = dictionaryPath == null ? ""
            : "Dictionary to be used located " + dictionaryPath.toString() + '\n';
    infoString += docTopicOutputPath == null ? ""
            : "p(topic|docId) will be stored " + docTopicOutputPath.toString() + '\n';
    log.info(infoString);

    FileSystem fs = FileSystem.get(topicModelStateTempPath.toUri(), conf);
    int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations);
    log.info("Current iteration number: {}", iterationNumber);

    conf.set(NUM_TOPICS, String.valueOf(numTopics));
    conf.set(NUM_TERMS, String.valueOf(numTerms));
    conf.set(DOC_TOPIC_SMOOTHING, String.valueOf(alpha));
    conf.set(TERM_TOPIC_SMOOTHING, String.valueOf(eta));
    conf.set(RANDOM_SEED, String.valueOf(randomSeed));
    conf.set(NUM_TRAIN_THREADS, String.valueOf(numTrainThreads));
    conf.set(NUM_UPDATE_THREADS, String.valueOf(numUpdateThreads));
    conf.set(MAX_ITERATIONS_PER_DOC, String.valueOf(maxItersPerDoc));
    conf.set(MODEL_WEIGHT, "1"); // TODO
    conf.set(TEST_SET_FRACTION, String.valueOf(testFraction));

    List<Double> perplexities = Lists.newArrayList();
    for (int i = 1; i <= iterationNumber; i++) {
        // form path to model
        Path modelPath = modelPath(topicModelStateTempPath, i);

        // read perplexity
        double perplexity = readPerplexity(conf, topicModelStateTempPath, i);
        if (Double.isNaN(perplexity)) {
            if (!(backfillPerplexity && i % iterationBlockSize == 0)) {
                continue;
            }
            log.info("Backfilling perplexity at iteration {}", i);
            if (!fs.exists(modelPath)) {
                log.error("Model path '{}' does not exist; Skipping iteration {} perplexity calculation",
                        modelPath.toString(), i);
                continue;
            }
            perplexity = calculatePerplexity(conf, inputPath, modelPath, i);
        }

        // register and log perplexity
        perplexities.add(perplexity);
        log.info("Perplexity at iteration {} = {}", i, perplexity);
    }

    long startTime = System.currentTimeMillis();
    while (iterationNumber < maxIterations) {
        // test convergence
        if (convergenceDelta > 0.0) {
            double delta = rateOfChange(perplexities);
            if (delta < convergenceDelta) {
                log.info("Convergence achieved at iteration {} with perplexity {} and delta {}",
                        new Object[] { iterationNumber, perplexities.get(perplexities.size() - 1), delta });
                break;
            }
        }

        // update model
        iterationNumber++;
        log.info("About to run iteration {} of {}", iterationNumber, maxIterations);
        Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1);
        Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber);
        runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations,
                numReduceTasks);

        // calculate perplexity
        if (testFraction > 0 && iterationNumber % iterationBlockSize == 0) {
            perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber));
            log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1));
            log.info("(p_{} - p_{}) / p_0 = {}; target = {}", new Object[] { iterationNumber,
                    iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta });
        }
    }
    log.info("Completed {} iterations in {} seconds", iterationNumber,
            (System.currentTimeMillis() - startTime) / 1000);
    log.info("Perplexities: ({})", Joiner.on(", ").join(perplexities));

    // write final topic-term and doc-topic distributions
    Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber);
    Job topicModelOutputJob = topicModelOutputPath != null
            ? writeTopicModel(conf, finalIterationData, topicModelOutputPath)
            : null;
    Job docInferenceJob = docTopicOutputPath != null
            ? writeDocTopicInference(conf, inputPath, finalIterationData, docTopicOutputPath)
            : null;
    if (topicModelOutputJob != null && !topicModelOutputJob.waitForCompletion(true)) {
        return -1;
    }
    if (docInferenceJob != null && !docInferenceJob.waitForCompletion(true)) {
        return -1;
    }
    return 0;
}

From source file:com.elex.dmp.lda.CVB0Driver.java

License:Apache License

private static void setModelPaths(Job job, Path modelPath) throws IOException {
    Configuration conf = job.getConfiguration();
    if (modelPath == null || !FileSystem.get(modelPath.toUri(), conf).exists(modelPath)) {
        return;
    }
    FileStatus[] statuses = FileSystem.get(modelPath.toUri(), conf).listStatus(modelPath,
            PathFilters.partFilter());
    Preconditions.checkState(statuses.length > 0, "No part files found in model path '%s'",
            modelPath.toString());
    String[] modelPaths = new String[statuses.length];
    for (int i = 0; i < statuses.length; i++) {
        modelPaths[i] = statuses[i].getPath().toUri().toString();
    }
    conf.setStrings(MODEL_PATHS, modelPaths);
}
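
A comparable listing step can be sketched with plain Hadoop APIs, using an inline PathFilter in place of Mahout's PathFilters.partFilter(); the model directory and the property name written at the end are hypothetical.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ModelPathLister {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path modelPath = new Path("/tmp/lda-model"); // hypothetical model directory
        FileSystem fs = FileSystem.get(modelPath.toUri(), conf);

        // Keep only the reducer output files (part-*).
        FileStatus[] statuses = fs.listStatus(modelPath, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                return p.getName().startsWith("part-");
            }
        });

        // Store the files' URIs as strings in the configuration.
        String[] modelPaths = new String[statuses.length];
        for (int i = 0; i < statuses.length; i++) {
            modelPaths[i] = statuses[i].getPath().toUri().toString();
        }
        conf.setStrings("model.paths", modelPaths); // hypothetical property name
    }
}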

From source file:com.ery.hadoop.mrddx.file.LineRecordReader.java

License:Apache License

void openFile() throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString() + " fileEncodeing="
            + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);
    if (file.getName().endsWith(".zip")) {
        LOG.info("use ZipInputStream read file " + split.getPath());
        ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
        in = new LineReader(zin, job);
        filePosition = fileIn;
        codec = new GzipCodec();
        return;
    }
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            // for .tar.gz files, wrap the split-aware compressed stream in a
            // TarInputStream instead of building
            // new TarInputStream(codec.createInputStream(fileIn, decompressor))
            String filename = file.getName();
            if (filename.endsWith(".tar.gz")) {
                in = new LineReader(new TarInputStream(cIn), job);
            } else {
                in = new LineReader(cIn, job);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            String filename = file.getName();
            if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        String filename = file.getName();
        if (filename.endsWith(".tar")) {
            in = new LineReader(new TarInputStream(fileIn), job);
        } else {
            in = new LineReader(fileIn, job);
        }

        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
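
The codec lookup at the top of openFile() can be exercised on its own; a sketch assuming only the codecs registered by default in the Configuration, with a hypothetical input file name.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookup {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory codecs = new CompressionCodecFactory(conf);

        // The factory matches on the file-name suffix of the Path.
        Path file = new Path("/data/input/events.log.gz"); // hypothetical input file
        CompressionCodec codec = codecs.getCodec(file);

        System.out.println(file.toString() + " -> "
                + (codec == null ? "no codec (plain text)" : codec.getClass().getName()));
    }
}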