List of usage examples for org.apache.hadoop.fs FileSystem globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files.
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
/**
 * Reads the value stored under {@link BayesConstants#TOTAL_SUM} from the sequence files
 * matched by {@code pathPattern}.
 *
 * <p>Every matched file is scanned completely; if several records carry the
 * {@code TOTAL_SUM} key, the last one read wins.
 *
 * @param fs          file system used both for globbing and for opening the files
 * @param pathPattern glob pattern selecting the sequence files to read
 * @param conf        configuration handed to each {@link SequenceFile.Reader}
 * @return the value of the {@code TOTAL_SUM} record
 * @throws IOException if a file cannot be read or closed
 */
public static double readSigmaJSigmaK(FileSystem fs, Path pathPattern, Configuration conf) throws IOException {
  Map<String, Double> weightSum = new HashMap<String, Double>();
  StringTuple key = new StringTuple();
  DoubleWritable value = new DoubleWritable();

  FileStatus[] outputFiles = fs.globStatus(pathPattern);
  for (FileStatus fileStatus : outputFiles) {
    Path path = fileStatus.getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    // Close the reader even when next() or the sanity check throws; the original
    // leaked one open reader per matched file.
    try {
      while (reader.next(key, value)) {
        // NOTE(review): only TOTAL_SUM is ever put into the map in this method,
        // so this size check cannot trigger as written.
        if (weightSum.size() > 1) {
          throw new IOException("Incorrect Sum File");
        } else if (key.stringAt(0).equals(BayesConstants.TOTAL_SUM)) {
          weightSum.put(BayesConstants.TOTAL_SUM, value.get());
        }
      }
    } finally {
      reader.close();
    }
  }

  // Unboxing: a NullPointerException is raised here when no TOTAL_SUM record was found.
  return weightSum.get(BayesConstants.TOTAL_SUM);
}
From source file:org.apache.mahout.classifier.bayes.io.SequenceFileModelReader.java
License:Apache License
public static double readVocabCount(FileSystem fs, Path pathPattern, Configuration conf) throws IOException { Map<String, Double> weightSum = new HashMap<String, Double>(); StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); FileStatus[] outputFiles = fs.globStatus(pathPattern); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); while (reader.next(key, value)) { if (weightSum.size() > 1) { throw new IOException("Incorrect vocabCount File"); }// w w w . ja v a 2 s.c o m if (key.stringAt(0).equals(BayesConstants.FEATURE_SET_SIZE)) { weightSum.put(BayesConstants.FEATURE_SET_SIZE, value.get()); } } } return weightSum.get(BayesConstants.FEATURE_SET_SIZE); }
From source file:org.apache.mahout.classifier.bayes.mapreduce.bayes.BayesClassifierDriver.java
License:Apache License
public static ConfusionMatrix readResult(FileSystem fs, Path pathPattern, Configuration conf, Parameters params) throws IOException { StringTuple key = new StringTuple(); DoubleWritable value = new DoubleWritable(); String defaultLabel = params.get("defaultCat"); FileStatus[] outputFiles = fs.globStatus(pathPattern); Map<String, Map<String, Integer>> confusionMatrix = new HashMap<String, Map<String, Integer>>(); for (FileStatus fileStatus : outputFiles) { Path path = fileStatus.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); while (reader.next(key, value)) { String correctLabel = key.stringAt(1); String classifiedLabel = key.stringAt(2); Map<String, Integer> rowMatrix = confusionMatrix.get(correctLabel); if (rowMatrix == null) { rowMatrix = new HashMap<String, Integer>(); }// w w w . ja v a 2s . co m Integer count = Double.valueOf(value.get()).intValue(); rowMatrix.put(classifiedLabel, count); confusionMatrix.put(correctLabel, rowMatrix); } } ConfusionMatrix matrix = new ConfusionMatrix(confusionMatrix.keySet(), defaultLabel); for (Map.Entry<String, Map<String, Integer>> correctLabelSet : confusionMatrix.entrySet()) { Map<String, Integer> rowMatrix = correctLabelSet.getValue(); for (Map.Entry<String, Integer> classifiedLabelSet : rowMatrix.entrySet()) { matrix.addInstance(correctLabelSet.getKey(), classifiedLabelSet.getKey()); matrix.putCount(correctLabelSet.getKey(), classifiedLabelSet.getKey(), classifiedLabelSet.getValue()); } } return matrix; }
From source file:org.apache.mahout.clustering.evaluation.RepresentativePointsDriver.java
License:Apache License
private static void writeInitialState(Path output, Path clustersIn) throws IOException { Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(output.toUri(), conf); for (FileStatus dir : fs.globStatus(clustersIn)) { Path inPath = dir.getPath(); for (FileStatus part : fs.listStatus(inPath, PathFilters.logsCRCFilter())) { Path inPart = part.getPath(); Path path = new Path(output, inPart.getName()); SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class); try { for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(inPart, true, conf)) {// www . j av a 2 s.c o m Cluster cluster = clusterWritable.getValue(); if (log.isDebugEnabled()) { log.debug("C-{}: {}", cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null)); } writer.append(new IntWritable(cluster.getId()), new VectorWritable(cluster.getCenter())); } } finally { Closeables.close(writer, false); } } } }
From source file:org.apache.mahout.clustering.lda.LDADriver.java
License:Apache License
static LDAState createState(Configuration job) throws IOException { String statePath = job.get(STATE_IN_KEY); int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY)); int numWords = Integer.parseInt(job.get(NUM_WORDS_KEY)); double topicSmoothing = Double.parseDouble(job.get(TOPIC_SMOOTHING_KEY)); Path dir = new Path(statePath); FileSystem fs = dir.getFileSystem(job); DenseMatrix pWgT = new DenseMatrix(numTopics, numWords); double[] logTotals = new double[numTopics]; double ll = 0.0; IntPairWritable key = new IntPairWritable(); DoubleWritable value = new DoubleWritable(); for (FileStatus status : fs.globStatus(new Path(dir, "part-*"))) { Path path = status.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job); while (reader.next(key, value)) { int topic = key.getFirst(); int word = key.getSecond(); if (word == TOPIC_SUM_KEY) { logTotals[topic] = value.get(); if (Double.isInfinite(value.get())) { throw new IllegalArgumentException(); }//from ww w.j a v a 2 s . c o m } else if (topic == LOG_LIKELIHOOD_KEY) { ll = value.get(); } else { if (!((topic >= 0) && (word >= 0))) { throw new IllegalArgumentException(topic + " " + word); } if (pWgT.getQuick(topic, word) != 0.0) { throw new IllegalArgumentException(); } pWgT.setQuick(topic, word, value.get()); if (Double.isInfinite(pWgT.getQuick(topic, word))) { throw new IllegalArgumentException(); } } } reader.close(); } return new LDAState(numTopics, numWords, topicSmoothing, pWgT, logTotals, ll); }
From source file:org.apache.mahout.clustering.lda.LDADriver.java
License:Apache License
/**
 * Looks up the log-likelihood entry in the {@code part-*} sequence files under
 * {@code statePath}.
 *
 * <p>Within each file, scanning stops at the first record whose first key component
 * equals {@code LOG_LIKELIHOOD_KEY}; all matched files are still visited, so the value
 * from the last file containing such a record is the one returned.
 *
 * @param statePath directory holding the state part files
 * @param job       configuration used to resolve the file system and open the readers
 * @return the log likelihood found, or {@code 0.0} if no file contains one
 * @throws IOException if a state file cannot be read or closed
 */
private double findLL(Path statePath, Configuration job) throws IOException {
  FileSystem fs = statePath.getFileSystem(job);

  double ll = 0.0;

  IntPairWritable key = new IntPairWritable();
  DoubleWritable value = new DoubleWritable();
  for (FileStatus status : fs.globStatus(new Path(statePath, "part-*"))) {
    Path path = status.getPath();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
    // Release the reader even if next() fails; the original skipped close() on error.
    try {
      while (reader.next(key, value)) {
        if (key.getFirst() == LOG_LIKELIHOOD_KEY) {
          ll = value.get();
          break;
        }
      }
    } finally {
      reader.close();
    }
  }

  return ll;
}
From source file:org.apache.mahout.common.HadoopUtil.java
License:Apache License
public static FileStatus[] getFileStatus(Path path, PathType pathType, PathFilter filter, Comparator<FileStatus> ordering, Configuration conf) throws IOException { FileStatus[] statuses;//www . j a v a 2s . c om FileSystem fs = path.getFileSystem(conf); if (filter == null) { statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path); } else { statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter); } if (ordering != null) { Arrays.sort(statuses, ordering); } return statuses; }
From source file:org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator.java
License:Apache License
/**
 * Constructor that uses either {@link FileSystem#listStatus(Path)} or
 * {@link FileSystem#globStatus(Path)} to obtain the list of files to iterate over
 * (depending on the {@code pathType} parameter).
 *
 * @param path                   directory or glob pattern identifying the files
 * @param pathType               {@code PathType.GLOB} to glob, anything else to list
 * @param filter                 filter applied while resolving files, or {@code null} for none
 * @param ordering               passed through to {@code init} together with the resolved files
 * @param reuseKeyValueInstances passed through to {@code init}
 * @param conf                   configuration used to obtain the file system and passed to {@code init}
 * @throws IOException if the file system cannot be obtained or queried
 */
public SequenceFileDirValueIterator(Path path, PathType pathType, PathFilter filter,
    Comparator<FileStatus> ordering, boolean reuseKeyValueInstances, Configuration conf) throws IOException {
  FileSystem fileSystem = FileSystem.get(path.toUri(), conf);
  boolean unfiltered = filter == null;

  FileStatus[] resolved;
  if (pathType == PathType.GLOB) {
    resolved = unfiltered ? fileSystem.globStatus(path) : fileSystem.globStatus(path, filter);
  } else {
    resolved = unfiltered ? fileSystem.listStatus(path) : fileSystem.listStatus(path, filter);
  }

  iterators = Lists.newArrayList();
  init(resolved, ordering, reuseKeyValueInstances, conf);
}
From source file:org.apache.mahout.fpm.bigfim.BigFIMDriver.java
License:Apache License
/**
 * Configures and runs the "Start Mining" MapReduce job.
 *
 * <p>Input is every path matching {@code <outputDir>/pg/bucket*}; output is written as
 * text to {@code <outputDir>/<OFis>} by a single reducer. Depending on
 * {@code config.getWriteSets()}, either the set-writing or the set-counting
 * mapper/reducer pair is used.
 *
 * @param outputDir base directory containing the {@code pg} input folder and receiving the output
 * @param config    mining configuration; also copied into the job configuration
 * @throws IOException            if the file system or the job fails with an I/O error
 * @throws ClassNotFoundException propagated from {@code Job#waitForCompletion}
 * @throws InterruptedException   propagated from {@code Job#waitForCompletion}
 */
private static void startMining(String outputDir, Config config)
    throws IOException, ClassNotFoundException, InterruptedException {
  String inputFilesDir = outputDir + separator + "pg" + separator;
  String outputFile = outputDir + separator + OFis;
  System.out.println("[StartMining]: input: " + inputFilesDir + ", output: " + outputFile);

  Configuration conf = new Configuration();
  setConfigurationValues(conf, config);

  Job job = new Job(conf, "Start Mining");
  job.setJarByClass(BigFIMDriver.class);

  job.setOutputKeyClass(Text.class);

  if (config.getWriteSets()) {
    job.setOutputValueClass(Text.class);
    job.setMapperClass(EclatMinerMapper.class);
    job.setReducerClass(EclatMinerReducer.class);
  } else {
    job.setOutputValueClass(LongWritable.class);
    job.setMapperClass(EclatMinerMapperSetCount.class);
    job.setReducerClass(EclatMinerReducerSetCount.class);
  }

  job.setInputFormatClass(NoSplitSequenceFileInputFormat.class);
  // The original also set SequenceFileOutputFormat immediately before this call; that
  // setting was overwritten right away and never took effect, so it has been dropped.
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setNumReduceTasks(1);

  List<Path> inputPaths = new ArrayList<Path>();

  FileSystem fs = FileSystem.get(conf);
  FileStatus[] listStatus = fs.globStatus(new Path(inputFilesDir + "bucket*"));
  // NOTE(review): FileSystem.get may hand out a shared, cached instance; closing it here
  // could affect other users of that instance in this JVM — confirm this is intended.
  fs.close();
  for (FileStatus fstat : listStatus) {
    inputPaths.add(fstat.getPath());
  }

  FileInputFormat.setInputPaths(job, inputPaths.toArray(new Path[inputPaths.size()]));
  FileOutputFormat.setOutputPath(job, new Path(outputFile));

  long start = System.currentTimeMillis();
  job.waitForCompletion(true);
  long end = System.currentTimeMillis();
  System.out.println("Job Mining took " + (end - start) / 1000 + "s");
}
From source file:org.apache.mahout.fpm.disteclat.DistEclatDriver.java
License:Apache License
/**
 * Starts the third MapReduce cycle. Each mapper reads the prefix groups assigned to it and
 * computes the collection of closed sets. All information is reported to the reducer, which
 * finally writes the output to disk.
 *
 * <p>Input paths are found by globbing {@code inputDir + "bucket*"}; the output goes to
 * {@code outputDir/<OFis>} as text, produced by a single reducer.
 *
 * @param inputDir  prefix that is concatenated with {@code "bucket*"} to locate the input files
 * @param outputDir base directory that receives the mining output
 * @param config    mining configuration; also copied into the job configuration
 * @throws IOException            if the file system or the job fails with an I/O error
 * @throws InterruptedException   propagated from {@code Job#waitForCompletion}
 * @throws ClassNotFoundException propagated from {@code Job#waitForCompletion}
 * @throws URISyntaxException     declared for callers; not thrown by any statement visible here
 */
private static void startMining(String inputDir, String outputDir, Config config)
    throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
  String resultPath = outputDir + separator + OFis;
  System.out.println("[StartMining]: input: " + inputDir + ", output: " + resultPath);

  Configuration configuration = new Configuration();
  setConfigurationValues(configuration, config);

  Job miningJob = new Job(configuration, "Start Mining");
  miningJob.setJarByClass(DistEclatDriver.class);

  miningJob.setOutputKeyClass(Text.class);

  // Pick the mapper/reducer pair: write the sets themselves, or only count them.
  boolean writeSets = config.getWriteSets();
  if (!writeSets) {
    miningJob.setOutputValueClass(LongWritable.class);
    miningJob.setMapperClass(EclatMinerMapperSetCount.class);
    miningJob.setReducerClass(EclatMinerReducerSetCount.class);
  } else {
    miningJob.setOutputValueClass(Text.class);
    miningJob.setMapperClass(EclatMinerMapper.class);
    miningJob.setReducerClass(EclatMinerReducer.class);
  }

  miningJob.setInputFormatClass(NoSplitSequenceFileInputFormat.class);

  List<Path> bucketPaths = new ArrayList<Path>();

  FileSystem fileSystem = FileSystem.get(configuration);
  FileStatus[] buckets = fileSystem.globStatus(new Path(inputDir + "bucket*"));
  for (int i = 0; i < buckets.length; i++) {
    bucketPaths.add(buckets[i].getPath());
  }

  Path[] inputs = bucketPaths.toArray(new Path[bucketPaths.size()]);
  FileInputFormat.setInputPaths(miningJob, inputs);
  FileOutputFormat.setOutputPath(miningJob, new Path(resultPath));

  miningJob.setOutputFormatClass(TextOutputFormat.class);

  miningJob.setNumReduceTasks(1);

  long startedAt = System.currentTimeMillis();
  miningJob.waitForCompletion(true);
  long finishedAt = System.currentTimeMillis();
  System.out.println("[Mining]: Took " + (finishedAt - startedAt) / 1000 + "s");
  fileSystem.close();
}