Example usage for the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable constructor

Introduction

On this page you can find example usage of the org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable constructor.

Prototype

public SequenceFileDirIterable(Path path, PathType pathType, Configuration conf) 
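
Before the examples, a minimal sketch of the constructor in use (the dumpSequenceFiles name and the Text key/value types are assumptions for illustration; substitute the Writable types your sequence files actually contain):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

// Print every key/value pair under a directory of sequence files.
// PathType.LIST iterates the files directly under the path;
// PathType.GLOB would expand the path as a glob pattern first.
public static void dumpSequenceFiles(Path dir, Configuration conf) {
    for (Pair<Text, Text> pair : new SequenceFileDirIterable<Text, Text>(dir, PathType.LIST, conf)) {
        System.out.println(pair.getFirst() + " => " + pair.getSecond());
    }
}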

Usage

From source file: com.clustertest2.clustertest2.vectorization.DocTokenizer.java

public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();

            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
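
A caveat on this example: the broad catch (Exception e) prints only the exception message, so tokenization and write failures are effectively swallowed; the finally block still decrements numThreads, which keeps the caller's thread accounting consistent either way.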

From source file: com.clustertest2.clustertest2.vectorization.TokenBuilder.java

@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        // overwrite old vector file
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
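
Compared with DocTokenizer above, this variant closes the TokenStream via try-with-resources so it is released even if tokenization fails, and it deletes any existing output file before writing, which makes the operation safely re-runnable.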

From source file: net.aprendizajengrande.ontocluster.Clusterer.java

License: Open Source License

public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with vectors> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();
    DistanceMeasure measure = new CosineDistanceMeasure();
    long seed = 67241;
    int numClusters = 250;
    int numIterations = 500;

    // see
    // http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create the vectors in HDFS
    System.out.println("Input: " + args[0]);
    Path input = new Path(args[0] + "/input");

    // first centroids are an input parameter to clustering
    Path clusters = new Path(args[0] + "/clusters");
    clusters = RandomSeedGenerator.buildRandom(conf, input, clusters, numClusters, measure, seed);

    Path output = new Path(args[1]);

    // cluster
    KMeansDriver.run(input, clusters, output, 0.005, numIterations, true, 0.0, false);

    // read the rel names, to pretty print

    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }
    br.close();

    // read output
    Path outputFinal = ClusterExtractor.findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST,
            conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));

    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}

From source file: net.aprendizajengrande.ontocluster.ClusterExtractor.java

License: Open Source License

public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {

    if (args.length != 3) {
        System.err.println(
                "Usage: <input hdfs folder with rels> <hdfs folder for output> <local folder for output>");
        System.exit(1);
    }

    Configuration conf = new Configuration();

    // see
    // http://stackoverflow.com/questions/17265002/hadoop-no-filesystem-for-scheme-file
    conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
    conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

    // create the vectors in HDFS
    System.out.println("Input: " + args[0]);

    // read the rel names, to pretty print

    Path inputRels = new Path(args[0] + "/rels");
    FileSystem fs = inputRels.getFileSystem(conf);
    FSDataInputStream fsdis = fs.open(inputRels);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsdis));
    String line = br.readLine();
    Map<Integer, String> relIdToName = new HashMap<>();
    while (line != null) {
        String[] parts = line.split("\\t");
        relIdToName.put(Integer.parseInt(parts[0]), parts[1]);
        line = br.readLine();
    }
    br.close();

    // read output
    Path outputFinal = findFinalClusters(args[1], conf);
    if (outputFinal == null) {
        System.err.println("Couldn't find final clusters at '" + args[1] + "-\\d+-final'");
        System.exit(1);
    }

    // delete the _SUCCESS file as it is problematic
    // see
    // http://stackoverflow.com/questions/10752708/eofexception-at-org-apache-hadoop-io-sequencefilereader-initsequencefile-java
    Path successFile = new Path(outputFinal, "_SUCCESS");
    if (fs.exists(successFile)) {
        fs.delete(successFile, false);
    }

    SequenceFileDirIterable<Text, Writable> it = new SequenceFileDirIterable<>(outputFinal, PathType.LIST,
            conf);

    PrintWriter pw = new PrintWriter(new FileWriter(new File(args[2])));

    int clusterNum = 0;
    for (Pair<Text, Writable> p : it) {
        Object obj = p.getSecond();
        if (!(obj instanceof ClusterWritable))
            continue;
        pw.println(clusterNum + ") " + p.getFirst());
        Cluster cluster = ((ClusterWritable) obj).getValue();
        Vector center = cluster.getCenter();
        for (int i = 0; i < center.size(); i++) {
            String name = relIdToName.get(i);
            if (name == null)
                name = "?";
            if (center.get(i) >= 0.01)
                pw.println("\t" + name + ": " + center.get(i));
        }
        pw.println();
        clusterNum++;
    }
    pw.close();
}
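
Both of these examples delete the _SUCCESS marker that Hadoop writes into the output directory, since it is not a sequence file and breaks the PathType.LIST iteration (see the linked Stack Overflow question). An alternative, not shown here, is to keep the marker and pass a glob such as new Path(outputFinal, "part-*") with PathType.GLOB so that only the part files are read.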

From source file: org.conan.mymahout.clustering.streaming.tools.ResplitSequenceFiles.java

License: Apache License

private void run(PrintWriter printWriter) throws IOException {
    conf = new Configuration();
    SequenceFileDirIterable<Writable, Writable> inputIterable = new SequenceFileDirIterable<Writable, Writable>(
            new Path(inputFile), PathType.LIST, conf);
    fs = FileSystem.get(conf);

    int numEntries = Iterables.size(inputIterable);
    int numEntriesPerSplit = numEntries / numSplits;
    // the last split absorbs the remainder when numEntries is not evenly divisible
    int numEntriesLastSplit = numEntriesPerSplit + numEntries - numEntriesPerSplit * numSplits;
    Iterator<Pair<Writable, Writable>> inputIterator = inputIterable.iterator();

    printWriter.printf("Writing %d splits\n", numSplits);
    for (int i = 0; i < numSplits - 1; ++i) {
        printWriter.printf("Writing split %d\n", i);
        writeSplit(inputIterator, i, numEntriesPerSplit);
    }
    printWriter.printf("Writing split %d\n", numSplits - 1);
    writeSplit(inputIterator, numSplits - 1, numEntriesLastSplit);
}