Usage examples for org.apache.mahout.common.iterator.sequencefile.PathType.GLOB

PathType.GLOB tells Mahout's SequenceFile directory iterators (SequenceFileDirIterator, SequenceFileDirIterable, SequenceFileDirValueIterable) to treat the supplied Path as a glob pattern, such as part-*, and to expand it against the file system before iterating, instead of listing a single directory as PathType.LIST does. The examples below show how several projects use it to read the part files produced by MapReduce jobs.
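Before the project-specific examples, here is a minimal sketch of the pattern they all share: iterating the key/value pairs of every SequenceFile matched by a glob. The path /tmp/wordcount/part-* and the Text/LongWritable key and value types are illustrative assumptions only, not taken from any of the source files below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

public class GlobIterationSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical output directory of a word-count style job.
        Path pattern = new Path("/tmp/wordcount/part-*");

        // PathType.GLOB expands the pattern before iterating. The CRC filter skips
        // Hadoop checksum files, the Comparator (null) keeps the default ordering,
        // and 'true' asks the iterator to reuse Writable instances between records.
        for (Pair<Text, LongWritable> record :
                new SequenceFileDirIterable<Text, LongWritable>(
                        pattern, PathType.GLOB, PathFilters.logsCRCFilter(), null, true, conf)) {
            System.out.println(record.getFirst() + "\t" + record.getSecond().get());
        }
    }
}

The same six-argument constructor shape appears in most of the examples that follow; only the key/value types and the glob differ.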
From source file:at.illecker.hadoop.rootbeer.examples.matrixmultiplication.DistributedRowMatrix.java
License:Apache License
@Override
public Iterator<MatrixSlice> iterateAll() {
    try {
        Path pathPattern = rowPath;
        if (FileSystem.get(conf).getFileStatus(rowPath).isDir()) {
            pathPattern = new Path(rowPath, "*");
        }
        return Iterators.transform(
                new SequenceFileDirIterator<IntWritable, VectorWritable>(pathPattern, PathType.GLOB,
                        PathFilters.logsCRCFilter(), null, true, conf),
                new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
                    @Override
                    public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
                        return new MatrixSlice(from.getSecond().get(), from.getFirst().get());
                    }
                });
    } catch (IOException ioe) {
        throw new IllegalStateException(ioe);
    }
}
From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.PFPGrowth.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the parallel counting job.
 *
 * @return feature frequency list
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {
                @Override
                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= minSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();

        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }

        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.clustertest2.clustertest2.vectorization.TokenBuilder.java
@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();

        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }

        // write the sequencefile
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        // overwrite old vector file
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            writer.close();
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Read the feature frequency list which is built at the end of the Word Count job and assign ids to the
 * features. This will use constant memory and will run at the speed of your disk read.
 */
private static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();

    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);

    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
                PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.closeQuietly(dictWriter);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.closeQuietly(dictWriter);
    }

    return chunkPaths;
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Read the header table which is built at the end of the parallel counting job.
 *
 * @return header table
 */
public static List<Pair<String, Long>> readFList(Parameters params) {
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get(OUTPUT), ITEM_FREQ);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
            new Comparator<Pair<String, Long>>() {
                public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                    int ret = o2.getSecond().compareTo(o1.getSecond());
                    if (ret != 0) {
                        return ret;
                    }
                    return o1.getFirst().compareTo(o2.getFirst());
                }
            });

    // Get absolute support from the relative threshold.
    Long numTrans = null;
    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        String feature = record.getFirst().toString();
        if (feature.compareTo("dataset") == 0) {
            numTrans = value;
            break;
        }
    }

    Double relativeSupport = Double.valueOf(params.get(MIN_SUPPORT, "0.9"));
    absSupport = (int) Math.ceil(relativeSupport * numTrans);

    log.info("# Transactions: " + numTrans);
    log.info("Support: " + relativeSupport * 100 + "%");
    log.info("Support count: " + absSupport);
    params.set(MIN_SUPPORT, (new Long(absSupport)).toString());

    for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
            new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
        long value = record.getSecond().get();
        if (value >= absSupport) {
            queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
        }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
        fList.add(queue.poll());
    }
    return fList;
}
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase,
        Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class,
            LongWritable.class);
    try {
        long currentChunkSize = 0;
        long featureCount = 0;
        long vectorCount = Long.MAX_VALUE;
        Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
        for (Pair<IntWritable, LongWritable> record : new SequenceFileDirIterable<IntWritable, LongWritable>(
                filesPattern, PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(freqWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
                currentChunkSize = 0;
            }

            int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
            currentChunkSize += fieldSize;
            IntWritable key = record.getFirst();
            LongWritable value = record.getSecond();
            if (key.get() >= 0) {
                freqWriter.append(key, value);
            } else if (key.get() == -1) {
                vectorCount = value.get();
            }
            featureCount = Math.max(key.get(), featureCount);
        }
        featureCount++;
        Long[] counts = { featureCount, vectorCount };
        return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
        Closeables.close(freqWriter, false);
    }
}
From source file:mlbench.bayes.BayesUtils.java
License:Apache License
static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase, Configuration baseConf,
        int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);

    SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
    try {
        long currentChunkSize = 0;
        Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
        int i = 0;
        for (Pair<Writable, Writable> record : new SequenceFileDirIterable<Writable, Writable>(filesPattern,
                PathType.GLOB, null, null, true, conf)) {
            if (currentChunkSize > chunkSizeLimit) {
                Closeables.close(dictWriter, false);
                chunkIndex++;

                chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
                chunkPaths.add(chunkPath);

                dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                currentChunkSize = 0;
            }

            Writable key = record.getFirst();
            int fieldSize = DICTIONARY_BYTE_OVERHEAD + key.toString().length() * 2 + Integer.SIZE / 8;
            currentChunkSize += fieldSize;
            dictWriter.append(key, new IntWritable(i++));
        }
        maxTermDimension[0] = i;
    } finally {
        Closeables.close(dictWriter, false);
    }

    return chunkPaths;
}
From source file:org.conan.mymahout.clustering.streaming.tools.ClusterQualitySummarizer.java
License:Apache License
public int run(String[] args) throws IOException {
    if (!parseArgs(args)) {
        return -1;
    }

    Configuration conf = new Configuration();
    try {
        // Configuration.dumpConfiguration(conf, new OutputStreamWriter(System.out));
        fileOut = new PrintWriter(new FileOutputStream(outputFile));
        fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,"
                + "distance.q4,count,is.train\n");

        // Reading in the centroids (both pairs, if they exist).
        List<Centroid> centroids;
        List<Centroid> centroidsCompare = null;
        if (mahoutKMeansFormat) {
            SequenceFileDirValueIterable<ClusterWritable> clusterIterable =
                    new SequenceFileDirValueIterable<ClusterWritable>(new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterIterable));
        } else {
            SequenceFileDirValueIterable<CentroidWritable> centroidIterable =
                    new SequenceFileDirValueIterable<CentroidWritable>(new Path(centroidFile), PathType.GLOB, conf);
            centroids = Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidIterable));
        }

        if (centroidCompareFile != null) {
            if (mahoutKMeansFormatCompare) {
                SequenceFileDirValueIterable<ClusterWritable> clusterCompareIterable =
                        new SequenceFileDirValueIterable<ClusterWritable>(new Path(centroidCompareFile),
                                PathType.GLOB, conf);
                centroidsCompare = Lists
                        .newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(clusterCompareIterable));
            } else {
                SequenceFileDirValueIterable<CentroidWritable> centroidCompareIterable =
                        new SequenceFileDirValueIterable<CentroidWritable>(new Path(centroidCompareFile),
                                PathType.GLOB, conf);
                centroidsCompare = Lists
                        .newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(centroidCompareIterable));
            }
        }

        // Reading in the "training" set.
        SequenceFileDirValueIterable<VectorWritable> trainIterable =
                new SequenceFileDirValueIterable<VectorWritable>(new Path(trainFile), PathType.GLOB, conf);
        Iterable<Vector> trainDatapoints = IOUtils.getVectorsFromVectorWritableIterable(trainIterable);
        Iterable<Vector> datapoints = trainDatapoints;

        printSummaries(ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids,
                new SquaredEuclideanDistanceMeasure()), "train");

        // Also adding in the "test" set.
        if (testFile != null) {
            SequenceFileDirValueIterable<VectorWritable> testIterable =
                    new SequenceFileDirValueIterable<VectorWritable>(new Path(testFile), PathType.GLOB, conf);
            Iterable<Vector> testDatapoints = IOUtils.getVectorsFromVectorWritableIterable(testIterable);

            printSummaries(ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids,
                    new SquaredEuclideanDistanceMeasure()), "test");

            datapoints = Iterables.concat(trainDatapoints, testDatapoints);
        }

        // At this point, all train/test CSVs have been written. We now compute quality metrics.
        List<OnlineSummarizer> summaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroids,
                distanceMeasure);
        List<OnlineSummarizer> compareSummaries = null;
        if (centroidsCompare != null) {
            compareSummaries = ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare,
                    distanceMeasure);
        }
        System.out.printf("[Dunn Index] First: %f",
                ClusteringUtils.dunnIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.dunnIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
        System.out.printf("[Davies-Bouldin Index] First: %f",
                ClusteringUtils.daviesBouldinIndex(centroids, distanceMeasure, summaries));
        if (compareSummaries != null) {
            System.out.printf(" Second: %f\n",
                    ClusteringUtils.daviesBouldinIndex(centroidsCompare, distanceMeasure, compareSummaries));
        } else {
            System.out.printf("\n");
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        Closeables.close(fileOut, false);
    }

    return 0;
}
From source file:tk.summerway.mahout9.tools.MyClusterDumper.java
License:Apache License
public void printClusters(String[] dictionary) throws Exception {
    Configuration conf = new Configuration();

    if (this.termDictionary != null) {
        if ("text".equals(dictionaryFormat)) {
            dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
        } else if ("sequencefile".equals(dictionaryFormat)) {
            dictionary = VectorHelper.loadTermDictionary(conf, this.termDictionary);
        } else {
            throw new IllegalArgumentException("Invalid dictionary format");
        }
    }

    Writer writer;
    boolean shouldClose;
    if (this.outputFile == null) {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    } else {
        shouldClose = true;
        if (outputFile.getName().startsWith("s3n://")) {
            Path p = outputPath;
            FileSystem fs = FileSystem.get(p.toUri(), conf);
            writer = new OutputStreamWriter(fs.create(p), Charsets.UTF_8);
        } else {
            Files.createParentDirs(outputFile);
            writer = Files.newWriter(this.outputFile, Charsets.UTF_8);
        }
    }

    ClusterWriter clusterWriter = createClusterWriter(writer, dictionary);
    try {
        long numWritten = clusterWriter.write(new SequenceFileDirValueIterable<ClusterWritable>(
                new Path(seqFileDir, "part-*"), PathType.GLOB, conf));

        writer.flush();

        if (runEvaluation) {
            MyClusterEvaluator ce = new MyClusterEvaluator(pointsDir.toString(), seqFileDir.toString(),
                    "~/cluster_evaluate_result.txt", measure, 1000L);
            ce.evaluateClusters(conf);
        }

        // if (runEvaluation) {
        //     HadoopUtil.delete(conf, new Path("tmp/representative"));
        //     int numIters = 5;
        //     RepresentativePointsDriver.main(new String[] { "--input", seqFileDir.toString(),
        //             "--output", "tmp/representative", "--clusteredPoints", pointsDir.toString(),
        //             "--distanceMeasure", measure.getClass().getName(),
        //             "--maxIter", String.valueOf(numIters) });
        //     conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, measure.getClass().getName());
        //     conf.set(RepresentativePointsDriver.STATE_IN_KEY,
        //             "tmp/representative/representativePoints-" + numIters);
        //     ClusterEvaluator ce = new ClusterEvaluator(conf, seqFileDir);
        //     writer.append("\n");
        //     writer.append("Inter-Cluster Density: ")
        //             .append(String.valueOf(ce.interClusterDensity())).append("\n");
        //     writer.append("Intra-Cluster Density: ")
        //             .append(String.valueOf(ce.intraClusterDensity())).append("\n");
        //     CDbwEvaluator cdbw = new CDbwEvaluator(conf, seqFileDir);
        //     writer.append("CDbw Inter-Cluster Density: ")
        //             .append(String.valueOf(cdbw.interClusterDensity())).append("\n");
        //     writer.append("CDbw Intra-Cluster Density: ")
        //             .append(String.valueOf(cdbw.intraClusterDensity())).append("\n");
        //     writer.append("CDbw Separation: ")
        //             .append(String.valueOf(cdbw.separation())).append("\n");
        //     writer.flush();
        // }

        log.info("Wrote {} clusters", numWritten);
    } finally {
        if (shouldClose) {
            Closeables.close(clusterWriter, false);
        } else {
            if (clusterWriter instanceof GraphMLClusterWriter) {
                clusterWriter.close();
            }
        }
    }
}