List of usage examples for org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable
public SequenceFileDirIterable(Path path, PathType pathType, Configuration conf)
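Before the full examples, a minimal sketch of basic usage: iterate over every key/value pair in every sequence file under a directory. The path and key/value types here are illustrative; the constructor shown is the one documented above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

public class Example {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        Path dir = new Path("/data/seqfiles"); // illustrative path
        // PathType.LIST lists the children of the directory; each sequence
        // file found is read in turn and its entries surfaced as Pairs.
        for (Pair<Text, Writable> pair :
                new SequenceFileDirIterable<Text, Writable>(dir, PathType.LIST, conf)) {
            System.out.println(pair.getFirst() + " => " + pair.getSecond());
        }
    }
}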
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }
        // write the sequence file
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            // no explicit close() needed: try-with-resources closes the writer
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
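The `new SequenceFile.Writer(fs, conf, path, keyClass, valueClass)` constructor used above is deprecated in Hadoop 2.x and later. A sketch of the equivalent write using the `SequenceFile.createWriter` option API (the path and the appended pair are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.StringTuple;

Configuration conf = new Configuration();
Path tokenizedSeq = new Path("/vectors/doc1"); // illustrative path
// createWriter resolves the FileSystem from the configuration, so no
// FileSystem handle needs to be passed explicitly.
try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(tokenizedSeq),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(StringTuple.class))) {
    writer.append(new Text("doc1"), new StringTuple());
}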
From source file:com.clustertest2.clustertest2.vectorization.TokenBuilder.java
@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }
        // write the sequence file
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        // overwrite the old vector file
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            // try-with-resources closes the writer
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
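Both examples above pass PathType.GLOB, which treats the path as a glob pattern expanded against the file system, whereas PathType.LIST lists the children of a single directory. A short sketch of the difference (the pattern is illustrative):

// GLOB: the path itself is a pattern; every matching sequence file is read.
Path glob = new Path("/output/tokens/part-*"); // illustrative pattern
for (Pair<Writable, Writable> pair :
        new SequenceFileDirIterable<Writable, Writable>(glob, PathType.GLOB, conf)) {
    // process pair.getFirst() / pair.getSecond()
}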
From source file:net.aprendizajengrande.ontocluster.Clusterer.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create vectors in HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");

    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);

    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);
    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));
    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable)) {
            continue;
        }
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null) {
                name = "?";
            }
            if (center.get(i) >= 0.01) {
                pw.println("\t" + name + ": " + center.get(i));
            }
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
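The helper ClusterExtractor.findFinalClusters is referenced here but not shown in this listing. A hypothetical sketch of what such a helper might look like, scanning the k-means output folder for the clusters-N-final directory Mahout writes on convergence; this reconstruction is an assumption, not the project's actual code:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical reconstruction, not the actual ClusterExtractor code.
static Path findFinalClusters(String outputDir, Configuration conf) throws IOException {
    Path output = new Path(outputDir);
    FileSystem fs = output.getFileSystem(conf);
    for (FileStatus status : fs.listStatus(output)) {
        String name = status.getPath().getName();
        // Mahout's k-means marks the converged iteration "clusters-<n>-final"
        if (status.isDirectory() && name.matches("clusters-\\d+-final")) {
            return status.getPath();
        }
    }
    return null; // callers treat null as "not found"
}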
From source file:net.aprendizajengrande.ontocluster.ClusterExtractor.java
License:Open Source License
public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with rels> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();

    // see http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    System.out.println("Input: " + args[0]);

    // read the rel names, to pretty print
    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }

    // read output
    Path outputFinal = findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }

    // delete the _SUCCESS file as it is problematic
    // see http://stackoverflow.com/questions/10752708/eofexception-at-org-apache-hadoop-io-sequencefilereader-initsequencefile-java
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST, conf);
    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));
    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable)) {
            continue;
        }
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null) {
                name = "?";
            }
            if (center.get(i) >= 0.01) {
                pw.println("\t" + name + ": " + center.get(i));
            }
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
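Rather than deleting the _SUCCESS marker from HDFS, SequenceFileDirIterable also has an overload that takes a PathFilter, so non-data files can simply be skipped. A sketch using Mahout's built-in PathFilters.partFilter(), which accepts only part-* files (assuming the standard Mahout API; behavior otherwise identical to the code above):

import org.apache.mahout.common.iterator.sequencefile.PathFilters;

// Skip _SUCCESS (and anything else that is not a part file) instead of
// deleting it from the output directory first.
SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(
        outputFinal, PathType.LIST, PathFilters.partFilter(), conf);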
From source file:org.conan.mymahout.clustering.streaming.tools.ResplitSequenceFiles.java
License:Apache License
private void run(PrintWriter printWriter) throws IOException {
    conf = new Configuration();
    SequenceFileDirIterable<Writable, Writable> inputIterable = new SequenceFileDirIterable<Writable, Writable>(
            new Path(inputFile), PathType.LIST, conf);
    fs = FileSystem.get(conf);

    int numEntries = Iterables.size(inputIterable);
    int numEntriesPerSplit = numEntries / numSplits;
    int numEntriesLastSplit = numEntriesPerSplit + numEntries - numEntriesPerSplit * numSplits;
    Iterator<Pair<Writable, Writable>> inputIterator = inputIterable.iterator();

    printWriter.printf("Writing %d splits\n", numSplits);
    for (int i = 0; i < numSplits - 1; ++i) {
        printWriter.printf("Writing split %d\n", i);
        writeSplit(inputIterator, i, numEntriesPerSplit);
    }
    printWriter.printf("Writing split %d\n", numSplits - 1);
    writeSplit(inputIterator, numSplits - 1, numEntriesLastSplit);
}
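Note that Iterables.size(inputIterable) consumes one full pass over the input before any split is written, so the directory is read twice in total; the last split absorbs the remainder when the entry count does not divide evenly. The writeSplit helper is not part of this excerpt; a hypothetical sketch of what it could look like, draining the next numEntries pairs from the shared iterator into a numbered output file (the outputFileBase field and the file naming are assumptions):

// Hypothetical reconstruction, not the actual helper from this source file.
private void writeSplit(Iterator<Pair<Writable, Writable>> inputIterator,
        int splitIndex, int numEntries) throws IOException {
    SequenceFile.Writer writer = null;
    try {
        for (int i = 0; i < numEntries && inputIterator.hasNext(); ++i) {
            Pair<Writable, Writable> pair = inputIterator.next();
            if (writer == null) {
                // Key/value classes are only known at runtime, so open the
                // writer lazily using the first pair's concrete types.
                writer = SequenceFile.createWriter(conf,
                        SequenceFile.Writer.file(new Path(outputFileBase + "-" + splitIndex)),
                        SequenceFile.Writer.keyClass(pair.getFirst().getClass()),
                        SequenceFile.Writer.valueClass(pair.getSecond().getClass()));
            }
            writer.append(pair.getFirst(), pair.getSecond());
        }
    } finally {
        if (writer != null) {
            writer.close();
        }
    }
}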