List of usage examples for org.apache.hadoop.fs.PathFilter
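PathFilter is a single-method callback, boolean accept(Path path), that FileSystem methods such as listStatus use to decide which paths to return. Nearly every example below implements it anonymously to skip Hadoop's underscore-prefixed side files (_SUCCESS, _logs). A minimal, self-contained sketch of that pattern (the input directory /tmp/input is a hypothetical placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class PathFilterSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical directory; replace with a real input path.
        Path dir = new Path("/tmp/input");
        // Skip "_SUCCESS", "_logs" and other underscore-prefixed side files.
        PathFilter dataFilesOnly = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !path.getName().startsWith("_");
            }
        };
        for (FileStatus status : fs.listStatus(dir, dataFilesOnly)) {
            System.out.println(status.getPath());
        }
    }
}

On Java 8+, since accept is the only abstract method, a lambda such as fs.listStatus(dir, p -> !p.getName().startsWith("_")) works as well.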
From source file:org.apache.mahout.classifier.sgd.TrainASFEmail.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("categories", "nc", "The number of categories to train on", true);
    addOption("cardinality", "c", "The size of the vectors to use", "100000");
    addOption("threads", "t", "The number of threads to use in the learner", "20");
    addOption("poolSize", "p", "The number of CrossFoldLearners to use in the AdaptiveLogisticRegression. "
            + "Higher values require more memory.", "5");
    if (parseArguments(args) == null) {
        return -1;
    }
    File base = new File(getInputPath().toString());
    Multiset<String> overallCounts = HashMultiset.create();
    File output = new File(getOutputPath().toString());
    output.mkdirs();
    int numCats = Integer.parseInt(getOption("categories"));
    int cardinality = Integer.parseInt(getOption("cardinality", "100000"));
    int threadCount = Integer.parseInt(getOption("threads", "20"));
    int poolSize = Integer.parseInt(getOption("poolSize", "5"));
    Dictionary asfDictionary = new Dictionary();
    AdaptiveLogisticRegression learningAlgorithm = new AdaptiveLogisticRegression(numCats, cardinality,
            new L1(), threadCount, poolSize);
    learningAlgorithm.setInterval(800);
    learningAlgorithm.setAveragingWindow(500);

    // We ran seq2encoded and split input already, so let's just build up the dictionary
    Configuration conf = new Configuration();
    PathFilter trainFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().contains("training");
        }
    };
    SequenceFileDirIterator<Text, VectorWritable> iter = new SequenceFileDirIterator<Text, VectorWritable>(
            new Path(base.toString()), PathType.LIST, trainFilter, null, true, conf);
    long numItems = 0;
    while (iter.hasNext()) {
        Pair<Text, VectorWritable> next = iter.next();
        asfDictionary.intern(next.getFirst().toString());
        numItems++;
    }
    System.out.println(numItems + " training files");

    SGDInfo info = new SGDInfo();
    iter = new SequenceFileDirIterator<Text, VectorWritable>(new Path(base.toString()), PathType.LIST,
            trainFilter, null, true, conf);
    int k = 0;
    while (iter.hasNext()) {
        Pair<Text, VectorWritable> next = iter.next();
        String ng = next.getFirst().toString();
        int actual = asfDictionary.intern(ng); // we already have encoded
        learningAlgorithm.train(actual, next.getSecond().get());
        k++;
        State<AdaptiveLogisticRegression.Wrapper, CrossFoldLearner> best = learningAlgorithm.getBest();
        SGDHelper.analyzeState(info, 0, k, best);
    }
    learningAlgorithm.close();
    // TODO: how to dissect since we aren't processing the files here
    // SGDHelper.dissect(leakType, asfDictionary, learningAlgorithm, files, overallCounts);
    System.out.println("exiting main, writing model to " + output);
    ModelSerializer.writeBinary(output + "/asf.model",
            learningAlgorithm.getBest().getPayload().getLearner().getModels().get(0));

    List<Integer> counts = Lists.newArrayList();
    System.out.println("Word counts");
    for (String count : overallCounts.elementSet()) {
        counts.add(overallCounts.count(count));
    }
    Collections.sort(counts, Ordering.natural().reverse());
    k = 0;
    for (Integer count : counts) {
        System.out.println(k + "\t" + count);
        k++;
        if (k > 1000) {
            break;
        }
    }
    return 0;
}
From source file:org.apache.mahout.freqtermsets.PFPGrowth.java
License:Apache License
/**
 * @param params
 *          params should contain input and output locations as a string value; additional
 *          parameters include minSupport(3), maxHeapSize(50), numGroups(1000)
 * @throws NoSuchAlgorithmException
 * @throws ParseException
 */
public static void runPFPGrowth(Parameters params) throws IOException, InterruptedException,
        ClassNotFoundException, NoSuchAlgorithmException, ParseException {
    Configuration conf = new Configuration();
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
    long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
    long windowSize = Long
            .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
    long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
    endTime = Math.min(endTime, startTime + windowSize);

    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    String countIn = params.get(COUNT_IN);
    if (countIn == null) {
        countIn = params.get(OUTROOT); // PUT);
    }
    int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT);
    int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT);

    if (params.get(COUNT_IN) == null) {
        startParallelCounting(params, conf);
    }

    if (params.get(GROUP_FIS_IN) == null) {
        // save feature list to dcache
        // List<Pair<String, Long>> fList = readFList(params);
        // saveFList(fList, params, conf);
        int fListSize = cacheFList(params, conf, countIn, minSupport, minFr, prunePct);

        if (runMode.equals(RunningMode.BlockUpdate)) {
            fListSize = -1;
            Path timeRoot = new Path(countIn).getParent().getParent();
            FileSystem fs = FileSystem.getLocal(conf);
            final long currStartTime = startTime;
            for (FileStatus earlierWindow : fs.listStatus(timeRoot, new PathFilter() {
                @Override
                public boolean accept(Path p) {
                    // should have used end time, but it doesn't make a difference,
                    // AS LONG AS windows don't overlap
                    return Long.parseLong(p.getName()) < currStartTime;
                }
            })) {
                // TODO: At such low frequency and support, does pruning out items with less frequency
                // than minFreq cause losing itemsets that are frequent over a longer time frame?
                cacheFList(params, conf, fs.listStatus(earlierWindow.getPath())[0].getPath().toString(),
                        minSupport, minFr, prunePct);
            }
        } else {
            // set param to control group size in MR jobs
            int numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);
            int maxPerGroup = fListSize / numGroups;
            if (fListSize % numGroups != 0) {
                maxPerGroup++;
            }
            params.set(MAX_PER_GROUP, Integer.toString(maxPerGroup));
        }
        // fList = null;
        startParallelFPGrowth(params, conf);
    } else {
        cacheFList(params, conf, countIn, minSupport, minFr, prunePct);
    }
    startAggregating(params, conf);

    if (runMode.equals(RunningMode.BlockUpdate)) {
        String indexDirStr; // = params.get(INDEX_OUT);
        // if (indexDirStr == null || indexDirStr.isEmpty()) {
        indexDirStr = FilenameUtils.concat(params.get(OUTPUT), "index");
        // } else {
        //     indexDirStr = FilenameUtils.concat(indexDirStr, startTime);
        //     indexDirStr = FilenameUtils.concat(indexDirStr, endTime);
        // }
        File indexDir = FileUtils.toFile(new URL(indexDirStr));

        // clean up
        FileUtils.deleteQuietly(indexDir);

        Path seqPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
        Directory earlierIndex = null;

        Path timeRoot = new Path(params.get(OUTPUT)).getParent().getParent();
        FileSystem fs = FileSystem.getLocal(conf);

        long mostRecent = Long.MIN_VALUE;
        Path mostRecentPath = null;
        for (FileStatus earlierWindow : fs.listStatus(timeRoot)) {
            long earlierStart = Long.parseLong(earlierWindow.getPath().getName());
            // should have used end time, but it doesn't make a difference,
            // AS LONG AS windows don't overlap
            if (earlierStart < startTime && earlierStart > mostRecent) {
                mostRecentPath = earlierWindow.getPath();
                mostRecent = earlierStart;
            }
        }
        if (mostRecentPath != null) {
            mostRecentPath = fs.listStatus(mostRecentPath)[0].getPath();
            mostRecentPath = new Path(mostRecentPath, "index");
            // earlierIndex = new Directory[1];
            // FIXME: as with anything that involves lucene.. won't work except on a local machine
            earlierIndex = new MMapDirectory(FileUtils.toFile(mostRecentPath.toUri().toURL()));
        }
    }
    // FIXME: When we want to stream, we have to build the index of the earlier window
    // ItemSetIndexBuilder.buildIndex(seqPath, indexDir,
    //     startTime, Math.min(endTime, startTime + windowSize), earlierIndex);
}
From source file:org.apache.metamodel.util.HdfsDirectoryInputStream.java
License:Apache License
public HdfsDirectoryInputStream(final Path hadoopPath, final FileSystem fs) {
    _hadoopPath = hadoopPath;
    _fs = fs;
    FileStatus[] fileStatuses;
    try {
        fileStatuses = _fs.listStatus(_hadoopPath, new PathFilter() {
            @Override
            public boolean accept(final Path path) {
                try {
                    return _fs.isFile(path);
                } catch (IOException e) {
                    return false;
                }
            }
        });
        // Natural ordering is the URL
        Arrays.sort(fileStatuses);
    } catch (IOException e) {
        fileStatuses = new FileStatus[0];
    }
    _files = fileStatuses;
}
From source file:org.apache.mrql.BinaryInputFormat.java
License:Apache License
/**
 * Collect the data from multiple sequence files at the path directory into a Bag.
 * @param path the path directory
 * @return a Bag that contains all data
 */
public Bag materialize(final Path path) throws IOException {
    final FileSystem fs = path.getFileSystem(Plan.conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    if (ds.length > 0)
        return new Bag(new BagIterator() {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, ds[0].getPath(), Plan.conf);
            MRContainer key = new MRContainer(new MR_int(0));
            MRContainer value = new MRContainer(new MR_int(0));
            int i = 1;

            public boolean hasNext() {
                try {
                    if (reader.next(key, value))
                        return true;
                    do {
                        if (i >= ds.length)
                            return false;
                        reader.close();
                        reader = new SequenceFile.Reader(fs, ds[i++].getPath(), Plan.conf);
                    } while (!reader.next(key, value));
                    return true;
                } catch (IOException e) {
                    throw new Error("Cannot collect values from an intermediate result");
                }
            }

            public MRData next() {
                return value.data();
            }
        });
    return new Bag();
}
From source file:org.apache.mrql.BSPMRQLFileInputFormat.java
License:Apache License
/**
 * Materialize the entire dataset into a Bag.
 * @param x the DataSet in HDFS to collect values from
 * @param strip true if you want to strip out the source id (used in BSP sources)
 * @return the Bag that contains the collected values
 */
public final Bag collect(final DataSet x, boolean strip) throws Exception {
    Bag res = new Bag();
    for (DataSource s : x.source)
        if (s.to_be_merged)
            res = res.union(Plan.merge(s));
        else {
            Path path = new Path(s.path);
            final FileSystem fs = path.getFileSystem(Plan.conf);
            final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
                public boolean accept(Path path) {
                    return !path.getName().startsWith("_");
                }
            });
            Bag b = new Bag();
            for (FileStatus st : ds)
                b = b.union(s.inputFormat.newInstance().materialize(st.getPath()));
            if (strip) {
                // remove source_num
                final Iterator<MRData> iter = b.iterator();
                b = new Bag(new BagIterator() {
                    public boolean hasNext() {
                        return iter.hasNext();
                    }

                    public MRData next() {
                        return ((Tuple) iter.next()).get(1);
                    }
                });
            }
            res = res.union(b);
        }
    return res;
}
From source file:org.apache.mrql.CrossProductOperation.java
License:Apache License
/**
 * The CrossProduct physical operator (similar to a block-nested loop).
 * @param mx left mapper
 * @param my right mapper
 * @param reduce_fnc reducer
 * @param acc_fnc optional accumulator function
 * @param zero optional zero value for the accumulator
 * @param X the left source
 * @param Y the right source (stored in distributed cache)
 * @param stop_counter optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet crossProduct(Tree mx,             // left mapper
                                         Tree my,             // right mapper
                                         Tree reduce_fnc,     // reducer
                                         Tree acc_fnc,        // optional accumulator function
                                         Tree zero,           // optional zero value for the accumulator
                                         DataSet X,           // the left source
                                         DataSet Y,           // the right source (stored in distributed cache)
                                         String stop_counter) // optional counter used in repeat operation
        throws Exception {
    DataSet ds = MapOperation.cMap(my, null, null, Y, "-");
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.mapper", mx.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    PathFilter pf = new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    for (DataSource p : ds.source) {
        Path path = new Path(p.path);
        for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf))
            DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration());
    }
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, crossProductMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job));
}
From source file:org.apache.mrql.HDFSFileInputStream.java
License:Apache License
private ArrayList<String> new_files() {
    try {
        long ct = System.currentTimeMillis();
        Path dpath = new Path(directory);
        final FileSystem fs = dpath.getFileSystem(Plan.conf);
        final FileStatus[] ds = fs.listStatus(dpath, new PathFilter() {
            public boolean accept(Path path) {
                return !path.getName().startsWith("_") && !path.getName().endsWith(".type");
            }
        });
        ArrayList<String> s = new ArrayList<String>();
        for (FileStatus d : ds) {
            String name = d.getPath().toString();
            if (file_modification_times.get(name) == null
                    || d.getModificationTime() > file_modification_times.get(name)) {
                file_modification_times.put(name, ct);
                s.add(name);
            }
        }
        return s;
    } catch (Exception ex) {
        throw new Error("Cannot open a new file from the directory " + directory + ": " + ex);
    }
}
From source file:org.apache.mrql.MapJoinOperation.java
License:Apache License
/**
 * The fragment-replicate join (map-side join) physical operator.
 * @param probe_map_fnc left mapper function
 * @param built_map_fnc right mapper function
 * @param reduce_fnc reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional zero value for the accumulator
 * @param probe_dataset the map source
 * @param built_dataset stored in distributed cache
 * @param stop_counter optional counter used in repeat operation
 * @return a new data source that contains the result
 */
public final static DataSet mapJoin(Tree probe_map_fnc,    // left mapper function
                                    Tree built_map_fnc,    // right mapper function
                                    Tree reduce_fnc,       // reducer function
                                    Tree acc_fnc,          // optional accumulator function
                                    Tree zero,             // optional zero value for the accumulator
                                    DataSet probe_dataset, // the map source
                                    DataSet built_dataset, // stored in distributed cache
                                    String stop_counter)   // optional counter used in repeat operation
        throws Exception {
    DataSet ds = MapOperation.cMap(built_map_fnc, null, null, built_dataset, "-");
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.inMap.reducer", reduce_fnc.toString());
    conf.set("mrql.probe_mapper", probe_map_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    setupSplits(new DataSet[] { probe_dataset, built_dataset }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    PathFilter pf = new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };
    for (DataSource p : ds.source) { // distribute the built dataset
        Path path = new Path(p.path);
        for (FileStatus s : path.getFileSystem(conf).listStatus(path, pf))
            DistributedCache.addCacheFile(s.getPath().toUri(), job.getConfiguration());
    }
    for (DataSource p : probe_dataset.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, mapJoinMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setNumReduceTasks(0);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    return new DataSet(new BinaryDataSource(newpath, conf), c, outputRecords(job));
}
From source file:org.apache.mrql.MapReducePlan.java
License:Apache License
/**
 * The Aggregate physical operator.
 * @param acc_fnc the accumulator function from (T,T) to T
 * @param zero the zero element of type T
 * @param S the dataset that contains the bag of values {T}
 * @return the aggregation result of type T
 */
public final static MRData aggregate(final Tree acc_fnc, final Tree zero, final DataSet S) throws Exception {
    MRData res = Interpreter.evalE(zero);
    Function accumulator = functional_argument(Plan.conf, acc_fnc);
    Tuple pair = new Tuple(2);
    for (DataSource s : S.source)
        if (s.inputFormat != MapReduceBinaryInputFormat.class) {
            pair.set(0, res);
            pair.set(1, aggregate(acc_fnc, zero, MapOperation.cMap(Interpreter.identity_mapper, acc_fnc,
                    zero, new DataSet(s, 0, 0), "-")));
            res = accumulator.eval(pair);
        } else {
            Path path = new Path(s.path);
            final FileSystem fs = path.getFileSystem(conf);
            final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
                public boolean accept(Path path) {
                    return !path.getName().startsWith("_");
                }
            });
            MRContainer key = new MRContainer(new MR_int(0));
            MRContainer value = new MRContainer(new MR_int(0));
            for (int i = 0; i < ds.length; i++) {
                SequenceFile.Reader reader = new SequenceFile.Reader(fs, ds[i].getPath(), conf);
                while (reader.next(key, value)) {
                    pair.set(0, res);
                    pair.set(1, value.data());
                    res = accumulator.eval(pair);
                }
                reader.close();
            }
        }
    return res;
}
From source file:org.apache.mrql.Plan.java
License:Apache License
/** Merge the sorted files of the data source. */
public final static Bag merge(final DataSource s) throws Exception {
    Path path = new Path(s.path);
    final FileSystem fs = path.getFileSystem(conf);
    final FileStatus[] ds = fs.listStatus(path, new PathFilter() {
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    });
    int dl = ds.length;
    if (dl == 0)
        return new Bag();
    Path[] paths = new Path[dl];
    for (int i = 0; i < dl; i++)
        paths[i] = ds[i].getPath();
    if (dl > Config.max_merged_streams) {
        if (Config.trace)
            System.out.println("Merging " + dl + " files");
        Path out_path = new Path(new_path(conf));
        SequenceFile.Sorter sorter = new SequenceFile.Sorter(fs, new MRContainerKeyComparator(),
                MRContainer.class, MRContainer.class, conf);
        sorter.merge(paths, out_path);
        paths = new Path[1];
        paths[0] = out_path;
    }
    final int n = paths.length;
    SequenceFile.Reader[] sreaders = new SequenceFile.Reader[n];
    for (int i = 0; i < n; i++)
        sreaders[i] = new SequenceFile.Reader(fs, paths[i], conf);
    final SequenceFile.Reader[] readers = sreaders;
    final MRContainer[] keys_ = new MRContainer[n];
    final MRContainer[] values_ = new MRContainer[n];
    for (int i = 0; i < n; i++) {
        keys_[i] = new MRContainer();
        values_[i] = new MRContainer();
    }
    return new Bag(new BagIterator() {
        int min = 0;
        boolean first = true;
        final MRContainer[] keys = keys_;
        final MRContainer[] values = values_;
        final MRContainer key = new MRContainer();
        final MRContainer value = new MRContainer();

        public boolean hasNext() {
            if (first)
                try {
                    first = false;
                    for (int i = 0; i < n; i++)
                        if (readers[i].next(key, value)) {
                            keys[i].set(key.data());
                            values[i].set(value.data());
                        } else {
                            keys[i] = null;
                            readers[i].close();
                        }
                } catch (IOException e) {
                    throw new Error("Cannot merge values from an intermediate result");
                }
            min = -1;
            for (int i = 0; i < n; i++)
                if (keys[i] != null && min < 0)
                    min = i;
                else if (keys[i] != null && keys[i].compareTo(keys[min]) < 0)
                    min = i;
            return min >= 0;
        }

        public MRData next() {
            try {
                MRData res = values[min].data();
                if (readers[min].next(key, value)) {
                    keys[min].set(key.data());
                    values[min].set(value.data());
                } else {
                    keys[min] = null;
                    readers[min].close();
                }
                return res;
            } catch (IOException e) {
                throw new Error("Cannot merge values from an intermediate result");
            }
        }
    });
}