Example usage for org.apache.hadoop.fs.PathFilter

Introduction

This page collects example usages of org.apache.hadoop.fs.PathFilter drawn from open-source projects.

Prototype

public interface PathFilter {
    boolean accept(Path path);
}
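
The examples below all follow the same idiom: an anonymous PathFilter implementation is passed to FileSystem.listStatus (or a similar directory-scanning call) to keep or skip entries by name. Here is a minimal, self-contained sketch of that idiom; the class name PathFilterSketch and the command-line directory argument are illustrative assumptions, not taken from the projects listed under Usage.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path dir = new Path(args[0]); // directory to scan (assumed to exist)
        FileSystem fs = dir.getFileSystem(conf);

        // Skip Hadoop bookkeeping files such as _SUCCESS and _logs.
        PathFilter skipUnderscoreFiles = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().startsWith("_");
            }
        };

        for (FileStatus status : fs.listStatus(dir, skipUnderscoreFiles)) {
            System.out.println(status.getPath());
        }
    }
}

Most of the examples on this page use exactly this pattern, either to skip bookkeeping output whose name starts with an underscore or to select files whose name contains a marker such as "training".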

Usage

From source file:org.apache.mahout.classifier.sgd.TrainASFEmail.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("categories", "nc", "The number of categories to train on", true);
    addOption("cardinality", "c", "The size of the vectors to use", "100000");
    addOption("threads", "t", "The number of threads to use in the learner", "20");
    addOption("poolSize", "p", "The number of CrossFoldLearners to use in the AdaptiveLogisticRegression. "
            + "Higher values require more memory.", "5");
    if (parseArguments(args) == null) {
        return -1;
    }

    File base = new File(getInputPath().toString());

    Multiset<String> overallCounts = HashMultiset.create();
    File output = new File(getOutputPath().toString());
    output.mkdirs();
    int numCats = Integer.parseInt(getOption("categories"));
    int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
    int threadCount = Integer.parseInt(getOption("threads", "20"));
    int poolSize = Integer.parseInt(getOption("poolSize", "5"));
    Dictionary asfDictionary = new Dictionary();
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(numCats, cardinality,
            new L1(), threadCount, poolSize);
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    //We ran seq2encoded and split input already, so let's just build up the dictionary
    Configuration conf = new Configuration();
    PathFilter trainFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("training");
        }
    };
    SequenceFileDirIterator<Text, VectorWritable> iter = new SequenceFileDirIterator<Text, VectorWritable>(
            new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
    long numItems = 0;
    while (iter.hasNext()) {
        Pair<Text, VectorWritable> next = iter.next();
        asfDictionary.intern(next.getFirst().toString());
        numItems++;
    }

    System.out.println(numItems + " training files");

    SGDInfo info = new SGDInfo();

    iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST,
            trainFilter, null, true, conf);
    int k = 0;
    while (iter.hasNext()) {
        Pair<Text, VectorWritable> next = iter.next();
        String ng = next.getFirst().toString();
        int actual = asfDictionary.intern(ng);
        //we already have encoded
        learningAlgorithm.train(actual, next.getSecond().get());
        k++;
        State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();

        SGDHelper.analyzeState(info, 0, k, best);
    }
    learningAlgorithm.close();
    //TODO: how to do the dissection since we aren't processing the files here
    //SGDHelper.dissect(leakType, asfDictionary, learningAlgorithm, files, overallCounts);
    System.out.println("exiting main, writing model to " + output);

    ModelSerializer.writeBinary(output + "/asf.model",
            learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));

    List<Integer> counts = Lists.newArrayList();
    System.out.println("Word counts");
    for (String count : overallCounts.elementSet()) {
        counts.add(overallCounts.count(count));
    }
    Collections.sort(counts, Ordering.natural().reverse());
    k = 0;
    for (Integer count : counts) {
        System.out.println(k + "\t" + count);
        k++;
        if (k > 1000) {
            break;
        }
    }
    return 0;
}

From source file:org.apache.mahout.freqtermsets.PFPGrowth.java

License:Apache License

/**
 *
 * @param params
 *          params should contain the input and output locations as string values; the additional
 *          parameters include minSupport (3), maxHeapSize (50), and numGroups (1000)
 * @throws NoSuchAlgorithmException
 * @throws ParseException
 */
public static void runPFPGrowth(Parameters params) throws IOException, InterruptedException,
        ClassNotFoundException, NoSuchAlgorithmException, ParseException {
    Configuration conf = new Configuration();
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
    long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
    long windowSize = Long
            .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
    long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
    endTime = Math.min(endTime, startTime + windowSize);

    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    String countIn = params.get(COUNT_IN);
    if (countIn == null) {
        countIn = params.get(OUTROOT); // PUT);
    }
    int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT);
    int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT);

    if (params.get(COUNT_IN) == null) {
        startParallelCounting(params, conf);
    }

    if (params.get(GROUP_FIS_IN) == null) {
        // save feature list to dcache
        // List<Pair<String, Long>> fList = readFList(params);
        // saveFList(fList, params, conf);

        int fListSize = cacheFList(params, conf, countIn, minSupport, minFr, prunePct);

        if (runMode.equals(RunningMode.BlockUpdate)) {
            fListSize = -1;
            Path timeRoot = new Path(countIn).getParent().getParent();
            FileSystem fs = FileSystem.getLocal(conf);
            final long currStartTime = startTime;
            for (FileStatus earlierWindow : fs.listStatus(timeRoot, new PathFilter() {
                @Override
                public boolean accept(Path p) {
                    // should have used end time, but it doesn't make a difference,
                    // AS LONG AS windows don't overlap
                    return Long.parseLong(p.getName()) < currStartTime;
                }
            })) {
                // TODO: At such low frequency and support, does pruning out items with frequency lower
                // than minFreq cause losing itemsets that are frequent only over a longer time frame?
                cacheFList(params, conf, fs.listStatus(earlierWindow.getPath())[0].getPath().toString(),
                        minSupport, minFr, prunePct);
            }
        } else {
            // set param to control group size in MR jobs
            int numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);
            int maxPerGroup = fListSize / numGroups;
            if (fListSize % numGroups != 0)
                maxPerGroup++;
            params.set(MAX_PER_GROUP, Integer.toString(maxPerGroup));
        }
        // fList = null;

        startParallelFPGrowth(params, conf);
    } else {
        cacheFList(params, conf, countIn, minSupport, minFr, prunePct);
    }
    startAggregating(params, conf);

    if (runMode.equals(RunningMode.BlockUpdate)) {
        String indexDirStr;// = params.get(INDEX_OUT);
        // if (indexDirStr == null || indexDirStr.isEmpty()) {
        indexDirStr = FilenameUtils.concat(params.get(OUTPUT), "index");
        // } else {
        // indexDirStr = FilenameUtils.concat(indexDirStr, startTime);
        // indexDirStr = FilenameUtils.concat(indexDirStr, endTime);
        // }
        File indexDir = FileUtils.toFile(new URL(indexDirStr));

        // clean up
        FileUtils.deleteQuietly(indexDir);

        Path seqPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
        Directory earlierIndex = null;

        Path timeRoot = new Path(params.get(OUTPUT)).getParent().getParent();
        FileSystem fs = FileSystem.getLocal(conf);

        long mostRecent = Long.MIN_VALUE;
        Path mostRecentPath = null;
        for (FileStatus earlierWindow : fs.listStatus(timeRoot)) {
            long earlierStart = Long.parseLong(earlierWindow.getPath().getName());
            // should have used end time, but it doesn't make a difference,
            // AS LONG AS windows don't overlap
            if (earlierStart < startTime && earlierStart > mostRecent) {
                mostRecentPath = earlierWindow.getPath();
                mostRecent = earlierStart;
            }
        }
        if (mostRecentPath != null) {
            mostRecentPath = fs.listStatus(mostRecentPath)[0].getPath();
            mostRecentPath = new Path(mostRecentPath, "index");
            // earlierIndex = new Directory[1];
            // FIXME: as with anything that involves Lucene, this won't work except on a local machine
            earlierIndex = new MMapDirectory(FileUtils.toFile(mostRecentPath.toUri().toURL()));
        }
    }
    // FIXME: When we want to stream, we have to build the index of earlier window
    // ItemSetIndexBuilder.buildIndex(seqPath, indexDir,
    // startTime, Math.min(endTime, startTime + windowSize), earlierIndex);
}

From source file:org.apache.metamodel.util.HdfsDirectoryInputStream.java

License:Apache License

public HdfsDirectoryInputStream(final Path hadoopPath, final FileSystem fs) {
    _hadoopPath = hadoopPath;
    _fs = fs;
    FileStatus[] fileStatuses;
    try {
        fileStatuses = _fs.listStatus(_hadoopPath, new PathFilter() {
            @Override
            public boolean accept(final Path path) {
                try {
                    return _fs.isFile(path);
                } catch (IOException e) {
                    return false;
                }
            }
        });
        // Natural ordering is the URL
        Arrays.sort(fileStatuses);
    } catch (IOException e) {
        fileStatuses = new FileStatus[0];
    }
    _files = fileStatuses;
}

From source file:org.apache.mrql.BinaryInputFormat.java

License:Apache License

/** collect the data from multiple sequence files under the given directory into a Bag
 * @param path the directory path
 * @return a Bag that contains all data
 */
public Bag materialize(final Path path) throws IOException {
    final FileSystem fs = path.getFileSystem(Plan.conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    if (ds.length > 0)
        return new Bag(new BagIterator() {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, ds[0].getPath(), Plan.conf);
            MRContainer key = new MRContainer(new MR_int(0));
            MRContainer value = new MRContainer(new MR_int(0));
            int i = 1;

            public boolean hasNext() {
                try {
                    if (reader.next(key, value))
                        return true;
                    do {
                        if (i >= ds.length)
                            return false;
                        reader.close();
                        reader = new SequenceFile.Reader(fs, ds[i++].getPath(), Plan.conf);
                    } while (!reader.next(key, value));
                    return true;
                } catch (IOException e) {
                    throw new Error("Cannot collect values from an intermediate result");
                }
            }

            public MRData next() {
                return value.data();
            }
        });
    return new Bag();
}

From source file:org.apache.mrql.BSPMRQLFileInputFormat.java

License:Apache License

/** materialize the entire dataset into a Bag
 * @param x the DataSet in HDFS to collect values from
 * @param strip true if you want to strip out the source id (used in BSP sources)
 * @return the Bag that contains the collected values
 */
public final Bag collect(final DataSet x, boolean strip) throws Exception {
    Bag res = new Bag();
    for (DataSource s : x.source)
        if (s.to_be_merged)
            res = res.union(Plan.merge(s));
        else {
            Path path = new Path(s.path);
            final FileSystem fs = path.getFileSystem(Plan.conf);
            final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
                public boolean accept(Path path) {
                    return !path.getName().startsWith("_");
                }
            });
            Bag b = new Bag();
            for (FileStatus st : ds)
                b = b.union(s.inputFormat.newInstance().materialize(st.getPath()));
            if (strip) {
                // remove source_num
                final Iterator<MRData> iter = b.iterator();
                b = new Bag(new BagIterator() {
                    public boolean hasNext() {
                        return iter.hasNext();
                    }

                    public MRData next() {
                        return ((Tuple) iter.next()).get(1);
                    }
                });
            }
            ;
            res = res.union(b);
        }
    ;
    return res;
}

From source file:org.apache.mrql.CrossProductOperation.java

License:Apache License

/** The CrossProduct physical operator (similar to block-nested loop)
 * @param mx              left mapper
 * @param my              right mapper
 * @param reduce_fnc      reducer
 * @param acc_fnc         optional accumulator function
 * @param zero            the optional zero value for the accumulator
 * @param X               the left source
 * @param Y               the right source (stored in distributed cache)
 * @param stop_counter    optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet crossProduct(Tree mx, // left mapper
        Tree my, // right mapper
        Tree reduce_fnc, // reducer
        Tree acc_fnc, // optional accumulator function
        Tree zero, // the optional zero value for the accumulator
        DataSet X, // the left source
        DataSet Y, // the right source (stored in distributed cache)
        String stop_counter) // optional counter used in repeat operation
        throws Exception {
    DataSet ds = MapOperation.cMap(my, null, null, Y, "-");
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.mapper", mx.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    PathFilter pf = new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    for (DataSource p : ds.source) {
        Path path = new Path(p.path);
        for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf))
            DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration());
    }
    ;
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, crossProductMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job));
}

From source file:org.apache.mrql.HDFSFileInputStream.java

License:Apache License

private ArrayList<String> new_files() {
    try {
        long ct = System.currentTimeMillis();
        Path dpath = new Path(directory);
        final FileSystem fs = dpath.getFileSystem(Plan.conf);
        final FileStatus[] ds = fs.listStatus(dpath, new PathFilter() {
            public boolean accept(Path path) {
                return !path.getName().startsWith("_") && !path.getName().endsWith(".type");
            }
        });
        ArrayList<String> s = new ArrayList<String>();
        for (FileStatus d : ds) {
            String name = d.getPath().toString();
            if (file_modification_times.get(name) == null
                    || d.getModificationTime() > file_modification_times.get(name)) {
                file_modification_times.put(name, new Long(ct));
                s.add(name);
            }
        }
        ;
        return s;
    } catch (Exception ex) {
        throw new Error("Cannot open a new file from the directory " + directory + ": " + ex);
    }
}

From source file:org.apache.mrql.MapJoinOperation.java

License:Apache License

/** The fragment-replicate join (map-side join) physical operator
 * @param probe_map_fnc    left mapper function
 * @param built_map_fnc    right mapper function
 * @param reduce_fnc       reducer function
 * @param acc_fnc          optional accumulator function
 * @param zero             the optional zero value for the accumulator
 * @param probe_dataset    the map source
 * @param built_dataset    stored in distributed cache
 * @param stop_counter     optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet mapJoin(Tree probe_map_fnc, // left mapper function
        Tree built_map_fnc, // right mapper function
        Tree reduce_fnc, // reducer function
        Tree acc_fnc, // optional accumulator function
        Tree zero, // the optional zero value for the accumulator
        DataSet probe_dataset, // the map source
        DataSet built_dataset, // stored in distributed cache
        String stop_counter) // optional counter used in repeat operation
        throws Exception {
    DataSet ds = MapOperation.cMap(built_map_fnc, null, null, built_dataset, "-");
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.inMap.reducer", reduce_fnc.toString());
    conf.set("mrql.probe_mapper", probe_map_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    setupSplits(new DataSet[] { probe_dataset, built_dataset }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    PathFilter pf = new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    for (DataSource p : ds.source) { // distribute the built dataset
        Path path = new Path(p.path);
        for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf))
            DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration());
    }
    ;
    for (DataSource p : probe_dataset.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, mapJoinMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job));
}

From source file:org.apache.mrql.MapReducePlan.java

License:Apache License

/** The Aggregate physical operator
 * @param acc_fnc  the accumulator function from (T,T) to T
 * @param zero  the zero element of type T
 * @param S the dataset that contains the bag of values {T}
 * @return the aggregation result of type T
 */
public final static MRData aggregate(final Tree acc_fnc, final Tree zero, final DataSet S) throws Exception {
    MRData res = Interpreter.evalE(zero);
    Function accumulator = functional_argument(Plan.conf, acc_fnc);
    Tuple pair = new Tuple(2);
    for (DataSource s : S.source)
        if (s.inputFormat != MapReduceBinaryInputFormat.class) {
            pair.set(0, res);
            pair.set(1, aggregate(acc_fnc, zero,
                    MapOperation.cMap(Interpreter.identity_mapper, acc_fnc, zero, new DataSet(s, 0, 0), "-")));
            res = accumulator.eval(pair);
        } else {
            Path path = new Path(s.path);
            final FileSystem fs = path.getFileSystem(conf);
            final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
                public boolean accept(Path path) {
                    return !path.getName().startsWith("_");
                }
            });
            MRContainer key = new MRContainer(new MR_int(0));
            MRContainer value = new MRContainer(new MR_int(0));
            for (int i = 0; i < ds.length; i++) {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, ds[i].getPath(), conf);
                while (reader.next(key, value)) {
                    pair.set(0, res);
                    pair.set(1, value.data());
                    res = accumulator.eval(pair);
                }
                ;
                reader.close();
            }
        }
    ;
    return res;
}

From source file:org.apache.mrql.Plan.java

License:Apache License

/** merge the sorted files of the data source */
public final static Bag merge(final DataSource s) throws Exception {
    Path path = new Path(s.path);
    final FileSystem fs = path.getFileSystem(conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    int dl = ds.length;
    if (dl == 0)
        return new Bag();
    Path[] paths = new Path[dl];
    for (int i = 0; i < dl; i++)
        paths[i] = ds[i].getPath();
    if (dl > Config.max_merged_streams) {
        if (Config.trace)
            System.out.println("Merging " + dl + " files");
        Path out_path = new Path(new_path(conf));
        SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new MRContainerKeyComparator(),
                MRContainer.class, MRContainer.class, conf);
        sorter.merge(paths, out_path);
        paths = new Path[1];
        paths[0] = out_path;
    }
    ;
    final int n = paths.length;
    SequenceFile.Reader[] sreaders = new SequenceFile.Reader[n];
    for (int i = 0; i < n; i++)
        sreaders[i] = new SequenceFile.Reader(fs, paths[i], conf);
    final SequenceFile.Reader[] readers = sreaders;
    final MRContainer[] keys_ = new MRContainer[n];
    final MRContainer[] values_ = new MRContainer[n];
    for (int i = 0; i < n; i++) {
        keys_[i] = new MRContainer();
        values_[i] = new MRContainer();
    }
    ;
    return new Bag(new BagIterator() {
        int min = 0;
        boolean first = true;
        final MRContainer[] keys = keys_;
        final MRContainer[] values = values_;
        final MRContainer key = new MRContainer();
        final MRContainer value = new MRContainer();

        public boolean hasNext() {
            if (first)
                try {
                    first = false;
                    for (int i = 0; i < n; i++)
                        if (readers[i].next(key, value)) {
                            keys[i].set(key.data());
                            values[i].set(value.data());
                        } else {
                            keys[i] = null;
                            readers[i].close();
                        }
                } catch (IOException e) {
                    throw new Error("Cannot merge values from an intermediate result");
                }
            ;
            min = -1;
            for (int i = 0; i < n; i++)
                if (keys[i] != null && min < 0)
                    min = i;
                else if (keys[i] != null && keys[i].compareTo(keys[min]) < 0)
                    min = i;
            return min >= 0;
        }

        public MRData next() {
            try {
                MRData res = values[min].data();
                if (readers[min].next(key, value)) {
                    keys[min].set(key.data());
                    values[min].set(value.data());
                } else {
                    keys[min] = null;
                    readers[min].close();
                }
                ;
                return res;
            } catch (IOException e) {
                throw new Error("Cannot merge values from an intermediate result");
            }
        }
    });
}