Example usage for org.apache.hadoop.fs Path getParent

Introduction

This page lists example usages of org.apache.hadoop.fs.Path#getParent, collected from the source files shown below.

Prototype

public Path getParent() 

Document

Returns the parent of a path, or null if the path is at the root.
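
A minimal sketch of that behavior (the paths and class name here are illustrative only, not taken from the examples below):

import org.apache.hadoop.fs.Path;

public class GetParentExample {
    public static void main(String[] args) {
        Path file = new Path("/data/output/part-r-00000");
        System.out.println(file.getParent());             // prints /data/output
        System.out.println(file.getParent().getParent()); // prints /data
        System.out.println(new Path("/").getParent());    // prints null: the root has no parent
    }
}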

Usage

From source file: org.apache.mahout.clustering.minhash.LastfmDataConverter.java

License: Apache License

/**
 * Converts each record in the (item, features) map into Mahout vector format
 * and writes it to a SequenceFile for MinHash clustering.
 */
public static boolean writeToSequenceFile(Map<String, List<Integer>> itemFeaturesMap, Path outputPath)
        throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
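    // ensure the directory that will contain the output sequence file exists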
    fs.mkdirs(outputPath.getParent());
    long totalRecords = itemFeaturesMap.size();
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, outputPath, Text.class,
            VectorWritable.class);
    try {
        String msg = "Now writing vectorized data in sequence file format: ";
        System.out.print(msg);

        Text itemWritable = new Text();
        VectorWritable featuresWritable = new VectorWritable();

        int doneRecords = 0;
        int prevPercentDone = 1;

        for (Map.Entry<String, List<Integer>> itemFeature : itemFeaturesMap.entrySet()) {
            int numfeatures = itemFeature.getValue().size();
            itemWritable.set(itemFeature.getKey());
            Vector featureVector = new SequentialAccessSparseVector(numfeatures);
            int i = 0;
            for (Integer feature : itemFeature.getValue()) {
                featureVector.setQuick(i++, feature);
            }
            featuresWritable.set(featureVector);
            writer.append(itemWritable, featuresWritable);
            // Update the progress
            double percentDone = ++doneRecords * 100.0 / totalRecords;
            if (percentDone > prevPercentDone) {
                System.out.print('\r' + msg + percentDone + "% " + (percentDone >= 100 ? "Completed\n" : ""));
                prevPercentDone++;
            }
        }
    } finally {
        Closeables.closeQuietly(writer);
    }
    return true;
}

From source file: org.apache.mahout.clustering.spectral.common.MatrixDiagonalizeJob.java

License: Apache License

public static Vector runJob(Path affInput, int dimensions)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up all the job tasks
    Configuration conf = new Configuration();
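    // write the diagonal output as a sibling of the affinity input, in a "diagonal" directory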
    Path diagOutput = new Path(affInput.getParent(), "diagonal");
    HadoopUtil.delete(conf, diagOutput);
    conf.setInt(EigencutsKeys.AFFINITY_DIMENSIONS, dimensions);
    Job job = new Job(conf, "MatrixDiagonalizeJob");

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(IntDoublePairWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MatrixDiagonalizeMapper.class);
    job.setReducerClass(MatrixDiagonalizeReducer.class);

    FileInputFormat.addInputPath(job, affInput);
    FileOutputFormat.setOutputPath(job, diagOutput);

    job.setJarByClass(MatrixDiagonalizeJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // read the results back from the path
    return VectorCache.load(conf, new Path(diagOutput, "part-r-00000"));
}

From source file: org.apache.mahout.clustering.spectral.common.VectorMatrixMultiplicationJob.java

License: Apache License

public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up the serialization of the diagonal vector
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
    markovPath = fs.makeQualified(markovPath);
    outputPath = fs.makeQualified(outputPath);
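    // serialize the diagonal vector beside the job output, under a sibling "vector" path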
    Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
    VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);

    // set up the job itself
    Job job = new Job(conf, "VectorMatrixMultiplication");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(VectorMatrixMultiplicationMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, markovPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setJarByClass(VectorMatrixMultiplicationJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size());
}

From source file: org.apache.mahout.clustering.spectral.eigencuts.EigencutsSensitivityJob.java

License: Apache License

/**
 * Initializes the configuration tasks, loads the needed data into
 * the HDFS cache, and executes the job.
 *
 * @param eigenvalues Vector of eigenvalues
 * @param diagonal Vector representing the diagonal matrix
 * @param eigenvectors Path to the DRM of eigenvectors
 * @param output Path to the output matrix (will have between n and full-rank
 *                non-zero elements)
 */
public static void runJob(Vector eigenvalues, Vector diagonal, Path eigenvectors, double beta, double tau,
        double delta, double epsilon, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {

    // save the two vectors to the distributed cache
    Configuration jobConfig = new Configuration();
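    // both cache files are written under the output's parent directory, whose name also serves as the cache base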
    Path eigenOutputPath = new Path(output.getParent(), "eigenvalues");
    Path diagOutputPath = new Path(output.getParent(), "diagonal");
    jobConfig.set(EigencutsKeys.VECTOR_CACHE_BASE, output.getParent().getName());
    VectorCache.save(new IntWritable(EigencutsKeys.EIGENVALUES_CACHE_INDEX), eigenvalues, eigenOutputPath,
            jobConfig);
    VectorCache.save(new IntWritable(EigencutsKeys.DIAGONAL_CACHE_INDEX), diagonal, diagOutputPath, jobConfig);

    // set up the rest of the job
    jobConfig.set(EigencutsKeys.BETA, Double.toString(beta));
    jobConfig.set(EigencutsKeys.EPSILON, Double.toString(epsilon));
    jobConfig.set(EigencutsKeys.DELTA, Double.toString(delta));
    jobConfig.set(EigencutsKeys.TAU, Double.toString(tau));

    Job job = new Job(jobConfig, "EigencutsSensitivityJob");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(EigencutsSensitivityNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(EigencutsSensitivityMapper.class);
    job.setReducerClass(EigencutsSensitivityReducer.class);

    FileInputFormat.addInputPath(job, eigenvectors);
    FileOutputFormat.setOutputPath(job, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file: org.apache.mahout.clustering.spectral.MatrixDiagonalizeJob.java

License: Apache License

public static Vector runJob(Path affInput, int dimensions)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up all the job tasks
    Configuration conf = new Configuration();
    Path diagOutput = new Path(affInput.getParent(), "diagonal");
    HadoopUtil.delete(conf, diagOutput);
    conf.setInt(Keys.AFFINITY_DIMENSIONS, dimensions);
    Job job = new Job(conf, "MatrixDiagonalizeJob");

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(IntDoublePairWritable.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(MatrixDiagonalizeMapper.class);
    job.setReducerClass(MatrixDiagonalizeReducer.class);

    FileInputFormat.addInputPath(job, affInput);
    FileOutputFormat.setOutputPath(job, diagOutput);

    job.setJarByClass(MatrixDiagonalizeJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // read the results back from the path
    return VectorCache.load(conf, new Path(diagOutput, "part-r-00000"));
}

From source file: org.apache.mahout.clustering.spectral.VectorMatrixMultiplicationJob.java

License: Apache License

public static DistributedRowMatrix runJob(Path markovPath, Vector diag, Path outputPath, Path tmpPath)
        throws IOException, ClassNotFoundException, InterruptedException {

    // set up the serialization of the diagonal vector
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(markovPath.toUri(), conf);
    markovPath = fs.makeQualified(markovPath);
    outputPath = fs.makeQualified(outputPath);
    Path vectorOutputPath = new Path(outputPath.getParent(), "vector");
    VectorCache.save(new IntWritable(Keys.DIAGONAL_CACHE_INDEX), diag, vectorOutputPath, conf);

    // set up the job itself
    Job job = new Job(conf, "VectorMatrixMultiplication");
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(VectorMatrixMultiplicationMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, markovPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setJarByClass(VectorMatrixMultiplicationJob.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    // build the resulting DRM from the results
    return new DistributedRowMatrix(outputPath, tmpPath, diag.size(), diag.size());
}

From source file: org.apache.mahout.clustering.topdown.postprocessor.ClusterOutputPostProcessorDriver.java

License: Apache License

/**
 * Uses the {@link FileSystem} rename method to move the file.
 */
private static void renameFile(Writable key, FileStatus fileStatus, Configuration conf) throws IOException {
    Path path = fileStatus.getPath();
    FileSystem fileSystem = path.getFileSystem(conf);
    Path subDir = new Path(key.toString());
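    // the rename target is a sibling directory of the original file, named after the key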
    Path renameTo = new Path(path.getParent(), subDir);
    fileSystem.mkdirs(renameTo);
    fileSystem.rename(path, renameTo);
}

From source file: org.apache.mahout.freqtermsets.ParallelFPStreamReducer.java

License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {

    super.setup(context);
    Configuration conf = context.getConfiguration();
    Parameters params = new Parameters(conf.get(PFPGrowth.PFP_PARAMETERS, ""));

    intervalStart = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
    intervalEnd = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
    windowSize = Long
            .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(intervalEnd - intervalStart)));
    endTimestamp = Math.min(intervalEnd, intervalStart + windowSize - 1);

    PFPGrowth.loadEarlierFHashMaps(context, params, intervalStart, idStringMap, stringIdMap);

    maxHeapSize = Integer.valueOf(params.get(PFPGrowth.MAX_HEAPSIZE, "50"));
    minSupport = Integer.valueOf(params.get(PFPGrowth.MIN_SUPPORT, "3"));

    numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);

    minWordsForLangDetection = params.getInt(MIN_WORDS_FOR_LANG_ID, MIN_WORDS_FOR_LANG_ID_DEFAULT);
    repeatHashTag = Boolean.parseBoolean(params.get(TokenIterator.PARAM_REPEAT_HASHTAG, "false"));

    long maxPatternLoadLag = Long.parseLong(
            params.get(PFPGrowth.PARAM_MAX_PATTERN_LOAD_LAG, PFPGrowth.DEFAULT_MAX_PATTERN_LOAD_LAG));

    Path mostRecentPath = null;
    Path outPath = new Path(params.get(PFPGrowth.OUTPUT));
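    // two levels above the output path is the root directory holding one subdirectory per time window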
    Path timeRoot = outPath.getParent().getParent();
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] otherWindows = fs.listStatus(timeRoot);
    //    List<IndexReader> earlierIndexes = Lists
    //        .<IndexReader> newArrayListWithCapacity(otherWindows.length - 1);
    for (int f = otherWindows.length - 1; f >= 0; --f) {
        Path p = otherWindows[f].getPath();
        long pathStartTime = Long.parseLong(p.getName());
        // should have used end time, but it doesn't make a difference,
        // AS LONG AS windows don't overlap
        //      long timeDifference = intervalStart - pathStartTime;
        //      if (timeDifference > 0 && timeDifference <= maxPatternLoadLag) {
        if (pathStartTime < intervalStart && pathStartTime > mostRecentTime) {
            p = fs.listStatus(p)[0].getPath();
            p = new Path(p, "index");
            if (fs.exists(p)) {
                mostRecentTime = pathStartTime;
                mostRecentPath = p;
                //          File indexDir = FileUtils.toFile(p.toUri().toURL());
                //          // FIXME: this will work only on local filesystem.. like many other parts of the code
                //          Directory fisdir = new MMapDirectory(indexDir);
                //          IndexReader fisIxReader = IndexReader.open(fisdir);
                //          earlierIndexes.add(fisIxReader);
            }
        }
    }
    if (mostRecentPath != null) {
        //    if(!earlierIndexes.isEmpty()) {
        //      fisIxMultiReader = new MultiReader(earlierIndexes.toArray(new IndexReader[0]));
        Directory fisdir = new MMapDirectory(FileUtils.toFile(mostRecentPath.toUri().toURL()));
        fisIxReader = IndexReader.open(fisdir);
        //      fisSearcher = new IndexSearcher(fisIxMultiReader);
        fisSearcher = new IndexSearcher(fisIxReader);
        fisSimilarity = new ItemSetSimilarity();
        fisSearcher.setSimilarity(fisSimilarity);

        fisQparser = new QueryParser(Version.LUCENE_36, ItemSetIndexBuilder.AssocField.ITEMSET.name, ANALYZER);
        fisQparser.setDefaultOperator(Operator.AND);

        timeWeigth = TimeWeightFunction.getDefault(params);
    }
}

From source file: org.apache.mahout.freqtermsets.PFPGrowth.java

License: Apache License

/**
 * Reads the feature frequency lists cached from earlier windows and combines
 * them, weighting each count by its window's start time relative to the current window.
 *
 * @return Deserialized Feature Frequency List
 */
public static OpenObjectLongHashMap<String> readOlderCachedFLists(Configuration conf, long currWindowStart,
        TimeWeightFunction weightFunction) throws IOException {
    OpenObjectLongHashMap<String> list = new OpenObjectLongHashMap<String>();
    Path[] files = DistributedCache.getLocalCacheFiles(conf);
    if (files == null) {
        throw new IOException("Cannot read Frequency list from Distributed Cache");
    }
    for (int i = 0; i < files.length; ++i) {
        FileSystem fs = FileSystem.getLocal(conf);
        Path fListLocalPath = fs.makeQualified(files[i]);
        // Fallback if we are running locally.
        if (!fs.exists(fListLocalPath)) {
            URI[] filesURIs = DistributedCache.getCacheFiles(conf);
            if (filesURIs == null) {
                throw new IOException("Cannot read Frequency list from Distributed Cache");
            }
            fListLocalPath = new Path(filesURIs[i].getPath());
        }
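        // the name of the grandparent directory encodes the start time of the window this list came from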
        long listWindowStart = Long.parseLong(fListLocalPath.getParent().getParent().getName());
        for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath,
                true, conf)) {
            String token = record.getFirst().toString();

            list.put(token, Math.round(list.get(token)
                    + weightFunction.apply(record.getSecond().get(), listWindowStart, currWindowStart)));
        }
    }
    return list;
}