List of usage examples for org.apache.hadoop.fs.PathFilter
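PathFilter is a single-method callback, boolean accept(Path path), that FileSystem methods such as listStatus use to decide which paths to return. Nearly every example below implements it anonymously to skip Hadoop's underscore-prefixed side files (_SUCCESS, _logs). A minimal, self-contained sketch of that pattern (the input directory /tmp/input is a hypothetical placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical directory; replace with a real input path.
        Path dir = new Path("/tmp/input");
        // Skip "_SUCCESS", "_logs" and other underscore-prefixed side files.
        PathFilter dataFilesOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().startsWith("_");
            }
        };
        for (FileStatus status : fs.listStatus(dir, dataFilesOnly)) {
            System.out.println(status.getPath());
        }
    }
}

On Java 8+, since accept is the only abstract method, a lambda such as fs.listStatus(dir, p -> !p.getName().startsWith("_")) works as well.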
From source file:org.apache.mahout.classifier.sgd.TrainASFEmail.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("categories", "nc", "The number of categories to train on", true);
    addOption("cardinality", "c", "The size of the vectors to use", "100000");
    addOption("threads", "t", "The number of threads to use in the learner", "20");
    addOption("poolSize", "p", "The number of CrossFoldLearners to use in the AdaptiveLogisticRegression. "
            + "Higher values require more memory.", "5");
    if (parseArguments(args) == null) {
        return -1;
    }
    File base = new File(getInputPath().toString());
    Multiset<String> overallCounts = HashMultiset.create();
    File output = new File(getOutputPath().toString());
    output.mkdirs();
    int numCats = Integer.parseInt(getOption("categories"));
    int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
    int threadCount = Integer.parseInt(getOption("threads", "20"));
    int poolSize = Integer.parseInt(getOption("poolSize", "5"));
    Dictionary asfDictionary = new Dictionary();
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(numCats, cardinality,
            new L1(), threadCount, poolSize);
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    // We ran seq2encoded and split input already, so let's just build up the dictionary
    Configuration conf = new Configuration();
    PathFilter trainFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("training");
        }
    };
    SequenceFileDirIterator<Text, VectorWritable> iter = new SequenceFileDirIterator<Text, VectorWritable>(
            new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
    long numItems = 0;
    while (iter.hasNext()) {
        Pair<Text, VectorWritable> next = iter.next();
        asfDictionary.intern(next.getFirst().toString());
        numItems++;
    }
    System.out.println(numItems + " training files");

    SGDInfo info = new SGDInfo();
    iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST,
            trainFilter, null, true, conf);
    int k = 0;
    while (iter.hasNext()) {
        Pair<Text, VectorWritable> next = iter.next();
        String ng = next.getFirst().toString();
        int actual = asfDictionary.intern(ng); // we already have encoded
        learningAlgorithm.train(actual, next.getSecond().get());
        k++;
        State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
        SGDHelper.analyzeState(info, 0, k, best);
    }
    learningAlgorithm.close();
    // TODO: how to dissect since we aren't processing the files here
    // SGDHelper.dissect(leakType, asfDictionary, learningAlgorithm, files, overallCounts);
    System.out.println("exiting main, writing model to " + output);
    ModelSerializer.writeBinary(output + "/asf.model",
            learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));

    List<Integer> counts = Lists.newArrayList();
    System.out.println("Word counts");
    for (String count : overallCounts.elementSet()) {
        counts.add(overallCounts.count(count));
    }
    Collections.sort(counts, Ordering.natural().reverse());
    k = 0;
    for (Integer count : counts) {
        System.out.println(k + "\t" + count);
        k++;
        if (k > 1000) {
            break;
        }
    }
    return 0;
}
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/**
 * @param params
 *          params should contain input and output locations as a string value; additional
 *          parameters include minSupport(3), maxHeapSize(50), numGroups(1000)
 * @throws NoSuchAlgorithmException
 * @throws ParseException
 */
public static void runPFPGrowth(Parameters params) throws IOException, InterruptedException,
        ClassNotFoundException, NoSuchAlgorithmException, ParseException {
    Configuration conf = new Configuration();
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
    long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
    long windowSize = Long
            .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
    long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
    endTime = Math.min(endTime, startTime + windowSize);

    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    String countIn = params.get(COUNT_IN);
    if (countIn == null) {
        countIn = params.get(OUTROOT); // PUT);
    }
    int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT);
    int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT);

    if (params.get(COUNT_IN) == null) {
        startParallelCounting(params, conf);
    }

    if (params.get(GROUP_FIS_IN) == null) {
        // save feature list to dcache
        // List<Pair<String, Long>> fList = readFList(params);
        // saveFList(fList, params, conf);
        int fListSize = cacheFList(params, conf, countIn, minSupport, minFr, prunePct);

        if (runMode.equals(RunningMode.BlockUpdate)) {
            fListSize = -1;
            Path timeRoot = new Path(countIn).getParent().getParent();
            FileSystem fs = FileSystem.getLocal(conf);
            final long currStartTime = startTime;
            for (FileStatus earlierWindow : fs.listStatus(timeRoot, new PathFilter() {
                @Override
                public boolean accept(Path p) {
                    // should have used end time, but it doesn't make a difference,
                    // AS LONG AS windows don't overlap
                    return Long.parseLong(p.getName()) < currStartTime;
                }
            })) {
                // TODO: At such low frequency and support, does pruning out items with less frequency
                // than minFreq cause losing itemsets that are frequent over a longer time frame?
                cacheFList(params, conf, fs.listStatus(earlierWindow.getPath())[0].getPath().toString(),
                        minSupport, minFr, prunePct);
            }
        } else {
            // set param to control group size in MR jobs
            int numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);
            int maxPerGroup = fListSize / numGroups;
            if (fListSize % numGroups != 0) {
                maxPerGroup++;
            }
            params.set(MAX_PER_GROUP, Integer.toString(maxPerGroup));
        }
        // fList = null;
        startParallelFPGrowth(params, conf);
    } else {
        cacheFList(params, conf, countIn, minSupport, minFr, prunePct);
    }
    startAggregating(params, conf);

    if (runMode.equals(RunningMode.BlockUpdate)) {
        String indexDirStr; // = params.get(INDEX_OUT);
        // if (indexDirStr == null || indexDirStr.isEmpty()) {
        indexDirStr = FilenameUtils.concat(params.get(OUTPUT), "index");
        // } else {
        //     indexDirStr = FilenameUtils.concat(indexDirStr, startTime);
        //     indexDirStr = FilenameUtils.concat(indexDirStr, endTime);
        // }
        File indexDir = FileUtils.toFile(new URL(indexDirStr));

        // clean up
        FileUtils.deleteQuietly(indexDir);

        Path seqPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
        Directory earlierIndex = null;

        Path timeRoot = new Path(params.get(OUTPUT)).getParent().getParent();
        FileSystem fs = FileSystem.getLocal(conf);

        long mostRecent = Long.MIN_VALUE;
        Path mostRecentPath = null;
        for (FileStatus earlierWindow : fs.listStatus(timeRoot)) {
            long earlierStart = Long.parseLong(earlierWindow.getPath().getName());
            // should have used end time, but it doesn't make a difference,
            // AS LONG AS windows don't overlap
            if (earlierStart < startTime && earlierStart > mostRecent) {
                mostRecentPath = earlierWindow.getPath();
                mostRecent = earlierStart;
            }
        }
        if (mostRecentPath != null) {
            mostRecentPath = fs.listStatus(mostRecentPath)[0].getPath();
            mostRecentPath = new Path(mostRecentPath, "index");
            // earlierIndex = new Directory[1];
            // FIXME: as with anything that involves lucene.. won't work except on a local machine
            earlierIndex = new MMapDirectory(FileUtils.toFile(mostRecentPath.toUri().toURL()));
        }
    }
    // FIXME: When we want to stream, we have to build the index of the earlier window
    // ItemSetIndexBuilder.buildIndex(seqPath, indexDir,
    //     startTime, Math.min(endTime, startTime + windowSize), earlierIndex);
}
From source file:org.apache.metamodel.util.HdfsDirectoryInputStream.java
License:Apache License
public HdfsDirectoryInputStream(final Path hadoopPath, final FileSystem fs) {
    _hadoopPath = hadoopPath;
    _fs = fs;
    FileStatus[] fileStatuses;
    try {
        fileStatuses = _fs.listStatus(_hadoopPath, new PathFilter() {
            @Override
            public boolean accept(final Path path) {
                try {
                    return _fs.isFile(path);
                } catch (IOException e) {
                    return false;
                }
            }
        });
        // Natural ordering is the URL
        Arrays.sort(fileStatuses);
    } catch (IOException e) {
        fileStatuses = new FileStatus[0];
    }
    _files = fileStatuses;
}
From source file:org.apache.mrql.BinaryInputFormat.java
License:Apache License
/**
 * Collect the data from multiple sequence files at the path directory into a Bag.
 * @param path the path directory
 * @return a Bag that contains all data
 */
public Bag materialize(final Path path) throws IOException {
    final FileSystem fs = path.getFileSystem(Plan.conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    if (ds.length > 0)
        return new Bag(new BagIterator() {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, ds[0].getPath(), Plan.conf);
            MRContainer key = new MRContainer(new MR_int(0));
            MRContainer value = new MRContainer(new MR_int(0));
            int i = 1;

            public boolean hasNext() {
                try {
                    if (reader.next(key, value))
                        return true;
                    do {
                        if (i >= ds.length)
                            return false;
                        reader.close();
                        reader = new SequenceFile.Reader(fs, ds[i++].getPath(), Plan.conf);
                    } while (!reader.next(key, value));
                    return true;
                } catch (IOException e) {
                    throw new Error("Cannot collect values from an intermediate result");
                }
            }

            public MRData next() {
                return value.data();
            }
        });
    return new Bag();
}
From source file:org.apache.mrql.BSPMRQLFileInputFormat.java
License:Apache License
/**
 * Materialize the entire dataset into a Bag.
 * @param x the DataSet in HDFS to collect values from
 * @param strip true if you want to strip out the source id (used in BSP sources)
 * @return the Bag that contains the collected values
 */
public final Bag collect(final DataSet x, boolean strip) throws Exception {
    Bag res = new Bag();
    for (DataSource s : x.source)
        if (s.to_be_merged)
            res = res.union(Plan.merge(s));
        else {
            Path path = new Path(s.path);
            final FileSystem fs = path.getFileSystem(Plan.conf);
            final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
                public boolean accept(Path path) {
                    return !path.getName().startsWith("_");
                }
            });
            Bag b = new Bag();
            for (FileStatus st : ds)
                b = b.union(s.inputFormat.newInstance().materialize(st.getPath()));
            if (strip) {
                // remove source_num
                final Iterator<MRData> iter = b.iterator();
                b = new Bag(new BagIterator() {
                    public boolean hasNext() {
                        return iter.hasNext();
                    }

                    public MRData next() {
                        return ((Tuple) iter.next()).get(1);
                    }
                });
            }
            res = res.union(b);
        }
    return res;
}
From source file:org.apache.mrql.CrossProductOperation.java
License:Apache License
/**
 * The CrossProduct physical operator (similar to a block-nested loop).
 * @param mx left mapper
 * @param my right mapper
 * @param reduce_fnc reducer
 * @param acc_fnc optional accumulator function
 * @param zero optional zero value for the accumulator
 * @param X the left source
 * @param Y the right source (stored in distributed cache)
 * @param stop_counter optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet crossProduct(Tree mx,             // left mapper
                                         Tree my,             // right mapper
                                         Tree reduce_fnc,     // reducer
                                         Tree acc_fnc,        // optional accumulator function
                                         Tree zero,           // optional zero value for the accumulator
                                         DataSet X,           // the left source
                                         DataSet Y,           // the right source (stored in distributed cache)
                                         String stop_counter) // optional counter used in repeat operation
        throws Exception {
    DataSet ds = MapOperation.cMap(my, null, null, Y, "-");
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.mapper", mx.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    PathFilter pf = new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    for (DataSource p : ds.source) {
        Path path = new Path(p.path);
        for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf))
            DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration());
    }
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, crossProductMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job));
}
From source file:org.apache.mrql.HDFSFileInputStream.java
License:Apache License
private ArrayList<String> new_files() {
    try {
        long ct = System.currentTimeMillis();
        Path dpath = new Path(directory);
        final FileSystem fs = dpath.getFileSystem(Plan.conf);
        final FileStatus[] ds = fs.listStatus(dpath, new PathFilter() {
            public boolean accept(Path path) {
                return !path.getName().startsWith("_") && !path.getName().endsWith(".type");
            }
        });
        ArrayList<String> s = new ArrayList<String>();
        for (FileStatus d : ds) {
            String name = d.getPath().toString();
            if (file_modification_times.get(name) == null
                    || d.getModificationTime() > file_modification_times.get(name)) {
                file_modification_times.put(name, ct);
                s.add(name);
            }
        }
        return s;
    } catch (Exception ex) {
        throw new Error("Cannot open a new file from the directory " + directory + ": " + ex);
    }
}
From source file:org.apache.mrql.MapJoinOperation.java
License:Apache License
/**
 * The fragment-replicate join (map-side join) physical operator.
 * @param probe_map_fnc left mapper function
 * @param built_map_fnc right mapper function
 * @param reduce_fnc reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional zero value for the accumulator
 * @param probe_dataset the map source
 * @param built_dataset stored in distributed cache
 * @param stop_counter optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet mapJoin(Tree probe_map_fnc,    // left mapper function
                                    Tree built_map_fnc,    // right mapper function
                                    Tree reduce_fnc,       // reducer function
                                    Tree acc_fnc,          // optional accumulator function
                                    Tree zero,             // optional zero value for the accumulator
                                    DataSet probe_dataset, // the map source
                                    DataSet built_dataset, // stored in distributed cache
                                    String stop_counter)   // optional counter used in repeat operation
        throws Exception {
    DataSet ds = MapOperation.cMap(built_map_fnc, null, null, built_dataset, "-");
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.inMap.reducer", reduce_fnc.toString());
    conf.set("mrql.probe_mapper", probe_map_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    setupSplits(new DataSet[] { probe_dataset, built_dataset }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    PathFilter pf = new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    for (DataSource p : ds.source) { // distribute the built dataset
        Path path = new Path(p.path);
        for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf))
            DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration());
    }
    for (DataSource p : probe_dataset.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, mapJoinMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job));
}
From source file:org.apache.mrql.MapReducePlan.java
License:Apache License
/**
 * The Aggregate physical operator.
 * @param acc_fnc the accumulator function from (T,T) to T
 * @param zero the zero element of type T
 * @param S the dataset that contains the bag of values {T}
 * @return the aggregation result of type T
 */
public final static MRData aggregate(final Tree acc_fnc, final Tree zero, final DataSet S) throws Exception {
    MRData res = Interpreter.evalE(zero);
    Function accumulator = functional_argument(Plan.conf, acc_fnc);
    Tuple pair = new Tuple(2);
    for (DataSource s : S.source)
        if (s.inputFormat != MapReduceBinaryInputFormat.class) {
            pair.set(0, res);
            pair.set(1, aggregate(acc_fnc, zero, MapOperation.cMap(Interpreter.identity_mapper, acc_fnc,
                    zero, new DataSet(s, 0, 0), "-")));
            res = accumulator.eval(pair);
        } else {
            Path path = new Path(s.path);
            final FileSystem fs = path.getFileSystem(conf);
            final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
                public boolean accept(Path path) {
                    return !path.getName().startsWith("_");
                }
            });
            MRContainer key = new MRContainer(new MR_int(0));
            MRContainer value = new MRContainer(new MR_int(0));
            for (int i = 0; i < ds.length; i++) {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, ds[i].getPath(), conf);
                while (reader.next(key, value)) {
                    pair.set(0, res);
                    pair.set(1, value.data());
                    res = accumulator.eval(pair);
                }
                reader.close();
            }
        }
    return res;
}
From source file:org.apache.mrql.Plan.java
License:Apache License
/** Merge the sorted files of the data source. */
public final static Bag merge(final DataSource s) throws Exception {
    Path path = new Path(s.path);
    final FileSystem fs = path.getFileSystem(conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    int dl = ds.length;
    if (dl == 0)
        return new Bag();
    Path[] paths = new Path[dl];
    for (int i = 0; i < dl; i++)
        paths[i] = ds[i].getPath();
    if (dl > Config.max_merged_streams) {
        if (Config.trace)
            System.out.println("Merging " + dl + " files");
        Path out_path = new Path(new_path(conf));
        SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new MRContainerKeyComparator(),
                MRContainer.class, MRContainer.class, conf);
        sorter.merge(paths, out_path);
        paths = new Path[1];
        paths[0] = out_path;
    }
    final int n = paths.length;
    SequenceFile.Reader[] sreaders = new SequenceFile.Reader[n];
    for (int i = 0; i < n; i++)
        sreaders[i] = new SequenceFile.Reader(fs, paths[i], conf);
    final SequenceFile.Reader[] readers = sreaders;
    final MRContainer[] keys_ = new MRContainer[n];
    final MRContainer[] values_ = new MRContainer[n];
    for (int i = 0; i < n; i++) {
        keys_[i] = new MRContainer();
        values_[i] = new MRContainer();
    }
    return new Bag(new BagIterator() {
        int min = 0;
        boolean first = true;
        final MRContainer[] keys = keys_;
        final MRContainer[] values = values_;
        final MRContainer key = new MRContainer();
        final MRContainer value = new MRContainer();

        public boolean hasNext() {
            if (first)
                try {
                    first = false;
                    for (int i = 0; i < n; i++)
                        if (readers[i].next(key, value)) {
                            keys[i].set(key.data());
                            values[i].set(value.data());
                        } else {
                            keys[i] = null;
                            readers[i].close();
                        }
                } catch (IOException e) {
                    throw new Error("Cannot merge values from an intermediate result");
                }
            min = -1;
            for (int i = 0; i < n; i++)
                if (keys[i] != null && min < 0)
                    min = i;
                else if (keys[i] != null && keys[i].compareTo(keys[min]) < 0)
                    min = i;
            return min >= 0;
        }

        public MRData next() {
            try {
                MRData res = values[min].data();
                if (readers[min].next(key, value)) {
                    keys[min].set(key.data());
                    values[min].set(value.data());
                } else {
                    keys[min] = null;
                    readers[min].close();
                }
                return res;
            } catch (IOException e) {
                throw new Error("Cannot merge values from an intermediate result");
            }
        }
    });
}