Example usage for org.apache.mahout.common.iterator.sequencefile PathFilters logsCRCFilter

Introduction

On this page you can find example usage for org.apache.mahout.common.iterator.sequencefile.PathFilters.logsCRCFilter(). As its name suggests, the returned PathFilter excludes Hadoop housekeeping entries such as _logs directories and .crc checksum files when listing or iterating over SequenceFile output.

Prototype

public static PathFilter logsCRCFilter() 
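
A minimal sketch of the filter in isolation (the ListDataFiles class and its argument handling are hypothetical, not taken from the sources below): Hadoop job output directories usually contain a _logs subdirectory and .crc checksum files next to the part-* data files, and logsCRCFilter() lets a directory listing skip those housekeeping entries.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;

public class ListDataFiles {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path jobOutput = new Path(args[0]); // e.g. a MapReduce job's output directory
        FileSystem fs = jobOutput.getFileSystem(conf);
        // listStatus with the filter returns only data files such as part-r-00000,
        // omitting _logs directories and .crc checksum entries.
        for (FileStatus status : fs.listStatus(jobOutput, PathFilters.logsCRCFilter())) {
            System.out.println(status.getPath());
        }
    }
}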

Usage

From source file:DisplayClustering.java

License:Apache License
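
Reads a list of Cluster objects from a directory of SequenceFiles: logsCRCFilter() keeps log and checksum entries out of the ClusterWritable iteration.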

protected static List<Cluster> readClustersWritable(Path clustersIn) {
    List<Cluster> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
            PathFilters.logsCRCFilter(), conf)) {
        Cluster cluster = value.getValue();
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}", cluster.getId(),
                AbstractCluster.formatVector(cluster.getCenter(), null), cluster.getNumObservations(),
                AbstractCluster.formatVector(cluster.getRadius(), null));
        clusters.add(cluster);
    }
    return clusters;
}

From source file:ac.keio.sslab.nlp.lda.RowIdJob.java

License:Apache License
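
Assigns sequential integer row IDs to Text-keyed vectors: the input directory is iterated with logsCRCFilter(), and each record is written to a docIndex file (row ID to original key) and a matrix file (row ID to vector).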

@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Path outputPath = getOutputPath();
    Path indexPath = new Path(outputPath, "docIndex");
    Path matrixPath = new Path(outputPath, "matrix");

    try (SequenceFile.Writer indexWriter = SequenceFile.createWriter(fs, conf, indexPath, IntWritable.class,
            Text.class);
            SequenceFile.Writer matrixWriter = SequenceFile.createWriter(fs, conf, matrixPath,
                    IntWritable.class, VectorWritable.class)) {
        IntWritable docId = new IntWritable();
        int i = 0;
        int numCols = 0;
        for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), null, true, conf)) {
            VectorWritable value = record.getSecond();
            docId.set(i);
            indexWriter.append(docId, record.getFirst());
            matrixWriter.append(docId, value);
            i++;
            numCols = value.get().size();
        }

        log.info("Wrote out matrix with {} rows and {} columns to {}", i, numCols, matrixPath);
        return 0;
    }
}

From source file:at.illecker.hadoop.rootbeer.examples.matrixmultiplication.DistributedRowMatrix.java

License:Apache License
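
Iterates all rows of a distributed matrix: a directory row path is expanded to a glob, and each (IntWritable, VectorWritable) pair is transformed into a MatrixSlice, with logsCRCFilter() excluding non-data files.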

@Override
public Iterator<MatrixSlice> iterateAll() {
    try {
        Path pathPattern = rowPath;
        if (FileSystem.get(conf).getFileStatus(rowPath).isDir()) {
            pathPattern = new Path(rowPath, "*");
        }
        return Iterators.transform(
                new SequenceFileDirIterator<IntWritable, VectorWritable>(pathPattern, PathType.GLOB,
                        PathFilters.logsCRCFilter(), null, true, conf),
                new Function<Pair<IntWritable, VectorWritable>, MatrixSlice>() {
                    @Override
                    public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
                        return new MatrixSlice(from.getSecond().get(), from.getFirst().get());
                    }
                });
    } catch (IOException ioe) {
        throw new IllegalStateException(ioe);
    }
}

From source file:com.elex.dmp.lda.InMemoryCollapsedVariationalBayes0.java

License:Apache License
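
Loads vectors from a single file or a directory into an in-memory SparseRowMatrix; for directories, listStatus() is called with logsCRCFilter() so that only data files are read.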

private static Matrix loadVectors(String vectorPathString, Configuration conf) throws IOException {
    Path vectorPath = new Path(vectorPathString);
    FileSystem fs = vectorPath.getFileSystem(conf);
    List<Path> subPaths = Lists.newArrayList();
    if (fs.isFile(vectorPath)) {
        subPaths.add(vectorPath);
    } else {
        for (FileStatus fileStatus : fs.listStatus(vectorPath, PathFilters.logsCRCFilter())) {
            subPaths.add(fileStatus.getPath());
        }
    }
    List<Vector> vectorList = Lists.newArrayList();
    for (Path subPath : subPaths) {
        for (Pair<IntWritable, VectorWritable> record : new SequenceFileIterable<IntWritable, VectorWritable>(
                subPath, true, conf)) {
            vectorList.add(record.getSecond().get());
        }
    }
    int numRows = vectorList.size();
    int numCols = vectorList.get(0).size();
    return new SparseRowMatrix(numRows, numCols, vectorList.toArray(new Vector[vectorList.size()]), true,
            vectorList.get(0).isSequentialAccess());
}

From source file:com.ikanow.infinit.e.processing.custom.utils.HadoopUtils.java

License:Open Source License
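
Converts SequenceFile job output into a BSON list, mapping common Hadoop and Mahout Writable key and value types to MongoDB objects; logsCRCFilter() keeps log and checksum files out of the iteration.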

public static BasicDBList getBsonFromSequenceFile(CustomMapReduceJobPojo cmr, int nLimit, String fields)
        throws SAXException, IOException, ParserConfigurationException {

    BasicDBList dbl = new BasicDBList();

    PropertiesManager props = new PropertiesManager();
    Configuration conf = getConfiguration(props);

    Path pathDir = HadoopUtils.getPathForJob(cmr, conf, false);

    @SuppressWarnings({ "unchecked", "rawtypes" })
    SequenceFileDirIterable<? extends Writable, ? extends Writable> seqFileDir = new SequenceFileDirIterable(
            pathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf);

    // Very basic, only allow top level, 1 level of nesting, and field removal
    HashSet<String> fieldLookup = null;
    if (null != fields) {
        fieldLookup = new HashSet<String>();
        String[] fieldArray = fields.split(",");
        for (String field : fieldArray) {
            String[] fieldDecomp = field.split(":");
            fieldLookup.add(fieldDecomp[0]);
        }
    } //TOTEST

    int nRecords = 0;
    for (Pair<? extends Writable, ? extends Writable> record : seqFileDir) {
        BasicDBObject element = new BasicDBObject();

        // KEY

        Writable key = record.getFirst();
        if (key instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) key;
            element.put("key", writable.toString());
        } else if (key instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) key;
            element.put("key", Double.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) key;
            element.put("key", Integer.toString(writable.get()));
        } else if (key instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) key;
            element.put("key", Long.toString(writable.get()));
        } else if (key instanceof BSONWritable) {
            element.put("key", MongoDbUtil.convert((BSONWritable) key));
        }

        // VALUE

        Writable value = record.getSecond();
        if (value instanceof org.apache.hadoop.io.Text) {
            org.apache.hadoop.io.Text writable = (org.apache.hadoop.io.Text) value;
            element.put("value", writable.toString());
        } else if (value instanceof org.apache.hadoop.io.DoubleWritable) {
            org.apache.hadoop.io.DoubleWritable writable = (org.apache.hadoop.io.DoubleWritable) value;
            element.put("value", Double.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.IntWritable) {
            org.apache.hadoop.io.IntWritable writable = (org.apache.hadoop.io.IntWritable) value;
            element.put("value", Integer.toString(writable.get()));
        } else if (value instanceof org.apache.hadoop.io.LongWritable) {
            org.apache.hadoop.io.LongWritable writable = (org.apache.hadoop.io.LongWritable) value;
            element.put("value", Long.toString(writable.get()));
        } else if (value instanceof BSONWritable) {
            element.put("value", MongoDbUtil.convert((BSONWritable) value));
        } else if (value instanceof org.apache.mahout.math.VectorWritable) {
            Vector vec = ((org.apache.mahout.math.VectorWritable) value).get();
            BasicDBList dbl2 = listFromMahoutVector(vec, "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.classify.WeightedVectorWritable) {
            org.apache.mahout.clustering.classify.WeightedVectorWritable vecW = (org.apache.mahout.clustering.classify.WeightedVectorWritable) value;
            element.put("valueWeight", vecW.getWeight());
            BasicDBList dbl2 = listFromMahoutVector(vecW.getVector(), "value", element);
            element.put("value", dbl2);
        } else if (value instanceof org.apache.mahout.clustering.iterator.ClusterWritable) {
            Cluster cluster = ((org.apache.mahout.clustering.iterator.ClusterWritable) value).getValue();
            BasicDBObject clusterVal = new BasicDBObject();
            clusterVal.put("center", listFromMahoutVector(cluster.getCenter(), "center", clusterVal));
            clusterVal.put("radius", listFromMahoutVector(cluster.getRadius(), "radius", clusterVal));
            element.put("value", clusterVal);
        } else {
            element.put("unknownValue", value.getClass().toString());
        }

        // Check the fields settings:
        // Only handle a few...
        if (null != fieldLookup) {
            for (String fieldToRemove : fieldLookup) {
                if (fieldToRemove.startsWith("value.")) {
                    fieldToRemove = fieldToRemove.substring(6);
                    BasicDBObject nested = (BasicDBObject) element.get("value");
                    if (null != nested) {
                        nested.remove(fieldToRemove);
                    }
                } else {
                    element.remove(fieldToRemove);
                }
            } //TOTEST
        }

        dbl.add(element);
        nRecords++;
        if ((nLimit > 0) && (nRecords >= nLimit)) {
            break;
        }
    }

    return dbl;
}

From source file:com.modofo.molo.cluster.DisplayClustering.java

License:Apache License
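
A variant of the readClustersWritable() example above, identical apart from the argument style of its log call.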

protected static List<Cluster> readClustersWritable(Path clustersIn) {
    List<Cluster> clusters = Lists.newArrayList();
    Configuration conf = new Configuration();
    for (ClusterWritable value : new SequenceFileDirValueIterable<ClusterWritable>(clustersIn, PathType.LIST,
            PathFilters.logsCRCFilter(), conf)) {
        Cluster cluster = value.getValue();
        log.info("Reading Cluster:{} center:{} numPoints:{} radius:{}",
                new Object[] { cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
                        cluster.getNumObservations(),
                        AbstractCluster.formatVector(cluster.getRadius(), null) });
        clusters.add(cluster);
    }
    return clusters;
}

From source file:com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java

License:Apache License
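
Builds a label index for Naive Bayes training, either from an explicit comma-separated label list or by extracting labels from the input SequenceFiles filtered with logsCRCFilter().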

private long createLabelIndex(Path labPath) throws IOException {
    long labelSize = 0;
    if (hasOption(LABELS)) {
        Iterable<String> labels = Splitter.on(",").split(getOption(LABELS));
        labelSize = BayesUtils.writeLabelIndex(getConf(), labels, labPath);
    } else if (hasOption(EXTRACT_LABELS)) {
        Iterable<Pair<Text, IntWritable>> iterable = new SequenceFileDirIterable<Text, IntWritable>(
                getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), getConf());
        labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
    }
    return labelSize;
}

From source file:com.netease.news.utils.SplitInput.java

License:Apache License
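
Splits a directory of per-category input files into training and test sets; in the sequential path, listStatus() with logsCRCFilter() selects only the real input files.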

public void splitDirectory(Configuration conf, Path inputDir)
        throws IOException, ClassNotFoundException, InterruptedException {
    FileSystem fs = inputDir.getFileSystem(conf);
    if (fs.getFileStatus(inputDir) == null) {
        throw new IOException(inputDir + " does not exist");
    }
    if (!fs.getFileStatus(inputDir).isDir()) {
        throw new IOException(inputDir + " is not a directory");
    }

    if (useMapRed) {
        SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
    } else {
        // input dir contains one file per category.
        FileStatus[] fileStats = fs.listStatus(inputDir, PathFilters.logsCRCFilter());
        for (FileStatus inputFile : fileStats) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License
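
Runs sequential k-means iterations: each pass reads all input vectors through logsCRCFilter(), assigns them to the nearest cluster, tests for convergence, and writes the updated clusters out for the next iteration.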

private static Path buildClustersSeq(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure, int maxIterations, String delta) throws IOException {

    KMeansClusterer clusterer = new KMeansClusterer(measure);
    Collection<Cluster> clusters = Lists.newArrayList();

    MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
    if (clusters.isEmpty()) {
        throw new IllegalStateException("Clusters is empty!");
    }
    boolean converged = false;
    int iteration = 1;
    while (!converged && iteration <= maxIterations) {
        log.info("K-Means Iteration: {}", iteration);
        FileSystem fs = FileSystem.get(input.toUri(), conf);
        for (VectorWritable value : new SequenceFileDirValueIterable<VectorWritable>(input, PathType.LIST,
                PathFilters.logsCRCFilter(), conf)) {
            clusterer.addPointToNearestCluster(value.get(), clusters);
        }
        converged = clusterer.testConvergence(clusters, Double.parseDouble(delta));
        Path clustersOut = new Path(output, AbstractCluster.CLUSTERS_DIR + iteration);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(clustersOut, "part-r-00000"),
                Text.class, Cluster.class);
        try {
            for (Cluster cluster : clusters) {
                if (log.isDebugEnabled()) {
                    log.debug("Writing Cluster:{} center:{} numPoints:{} radius:{} to: {}", new Object[] {
                            cluster.getId(), AbstractCluster.formatVector(cluster.getCenter(), null),
                            cluster.getNumPoints(), AbstractCluster.formatVector(cluster.getRadius(), null),
                            clustersOut.getName() });
                }
                writer.append(new Text(cluster.getIdentifier()), cluster);
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
        clustersIn = clustersOut;
        iteration++;
    }
    Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)
            + org.apache.mahout.clustering.Cluster.FINAL_ITERATION_SUFFIX);
    FileSystem.get(conf).rename(new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration - 1)),
            finalClustersIn);
    return finalClustersIn;
}

From source file:edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java

License:Apache License
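
Classifies input vectors against the final clusters: listStatus() with logsCRCFilter() enumerates the input files, and each vector is emitted to its nearest cluster as a WeightedVectorWritable.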

private static void clusterDataSeq(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure) throws IOException {

    KMeansClusterer clusterer = new KMeansClusterer(measure);
    Collection<Cluster> clusters = Lists.newArrayList();
    MemKMeansUtil.configureWithClusterInfo(conf, clustersIn, clusters);
    if (clusters.isEmpty()) {
        throw new IllegalStateException("Clusters is empty!");
    }
    FileSystem fs = FileSystem.get(input.toUri(), conf);
    FileStatus[] status = fs.listStatus(input, PathFilters.logsCRCFilter());
    int part = 0;
    for (FileStatus s : status) {
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(output, "part-m-" + part),
                IntWritable.class, WeightedVectorWritable.class);
        try {
            for (VectorWritable value : new SequenceFileValueIterable<VectorWritable>(s.getPath(), conf)) {
                clusterer.emitPointToNearestCluster(value.get(), clusters, writer);
            }
        } finally {
            Closeables.closeQuietly(writer);
        }
    }

}