Usage examples for the org.apache.hadoop.mapred SequenceFileRecordReader(Configuration, FileSplit) constructor
public SequenceFileRecordReader(Configuration conf, FileSplit split) throws IOException
From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileInputFormat.java
License:Apache License
@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    // Surface the split being processed in the task's status line.
    reporter.setStatus(split.toString());
    // Wrap the combined split in a reader that walks its member files one at a
    // time, opening a plain SequenceFileRecordReader over each of them.
    return new CommonCombineRecordReader(job, (CombineFileSplit) split,
            new CommonCombineRecordReader.RecordReaderEngineerer() {
                @Override
                public RecordReader createRecordReader(Configuration conf, FileSplit split)
                        throws IOException {
                    return new SequenceFileRecordReader<K, V>(conf, split);
                }
            });
}
From source file:com.conductor.s3.S3SequenceFileInputFormatMRV1.java
License:Apache License
@Override
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    // Report which split this task is reading, then delegate to the stock reader.
    reporter.setStatus(split.toString());
    final FileSplit fileSplit = (FileSplit) split;
    return new SequenceFileRecordReader<K, V>(job, fileSplit);
}
From source file:com.ibm.bi.dml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License:Open Source License
@Override
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    // The reporter may be absent when this is invoked outside a live task
    // context, so only publish status when one was supplied.
    if (reporter != null) {
        reporter.setStatus(split.toString());
    }
    return new SequenceFileRecordReader<K, V>(job, (FileSplit) split);
}
From source file:edu.ucsb.cs.hadoop.CustomSequenceFileInputFormat.java
License:Apache License
@Override
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
    // Guard against a null reporter (the sibling SamplingSortMRInputFormat in
    // this listing does the same); the original dereferenced it unconditionally
    // and would throw NullPointerException when called outside a task context.
    if (reporter != null) {
        reporter.setStatus(split.toString());
    }
    return new SequenceFileRecordReader<K, V>(job, (FileSplit) split);
}
From source file:hitune.analysis.mapreduce.MultiSequenceFileReader.java
License:Apache License
/** * /* w ww .j av a 2s. c o m*/ */ public MultiSequenceFileReader(Configuration conf, MultiFileSplit split) throws IOException { // TODO Auto-generated constructor stub this.split = split; this.conf = conf; paths = this.split.getPaths(); fs = FileSystem.get(conf); totalLength = split.getLength(); pos = 0; log.debug("total split number:" + split.getNumPaths()); log.debug("open split:" + paths[0].toString()); FileSplit filesplit = new FileSplit(paths[0], 0, split.getLength(0), (JobConf) conf); reader = new SequenceFileRecordReader<ChukwaRecordKey, ChukwaRecord>(conf, filesplit); if (reader == null) { log.warn("open split failed!"); } }
From source file:hitune.analysis.mapreduce.MultiSequenceFileReader.java
License:Apache License
/**
 * Reads the next record into {@code key}/{@code value}, transparently rolling
 * over to the next file of the multi-file split whenever the current reader
 * is exhausted.
 *
 * @return true if a record was read; false once every file has been consumed
 * @throws IOException if no reader is open or an underlying read fails
 */
@Override
public boolean next(ChukwaRecordKey key, ChukwaRecord value) throws IOException {
    if (reader == null) {
        throw new IOException("reader is empty");
    } else {
        more = reader.next(key, value);
    }
    // Current file exhausted: close it and walk forward through the remaining
    // paths until a record is produced or the whole split has been consumed.
    while (!more) {
        if (reader != null) {
            log.debug("close previous reader:" + count);
            reader.close();
            reader = null;
        }
        // Accumulate consumed bytes so we know when the split is finished.
        pos += split.getLength(count);
        if (pos < totalLength) {
            count++;
            log.debug("current split number:" + count);
            // NOTE(review): "slit" looks like a typo for "split" in this log
            // message; left untouched since it is a runtime string.
            log.debug("open slit: " + paths[count]);
            FileSplit filesplit = new FileSplit(paths[count], 0, split.getLength(count), (JobConf) conf);
            reader = new SequenceFileRecordReader<ChukwaRecordKey, ChukwaRecord>(conf, filesplit);
            // NOTE(review): 'new' never returns null, so this null branch is
            // dead code; open failures surface as IOException from 'new' above.
            if (reader == null) {
                throw new IOException("reader is empty");
            } else {
                more = reader.next(key, value);
            }
        } else {
            break;
        }
    }
    return more;
}
From source file:ml.shifu.guagua.hadoop.io.GuaguaSequenceRecordReader.java
License:Apache License
@Override
public void initialize(GuaguaFileSplit split) throws IOException {
    // Translate the framework-neutral GuaguaFileSplit into a Hadoop FileSplit
    // (no preferred-host list) and open the standard sequence-file reader on it.
    Path splitPath = new Path(split.getPath());
    FileSplit fileSplit = new FileSplit(splitPath, split.getOffset(), split.getLength(), (String[]) null);
    this.sequenceReader = new SequenceFileRecordReader<KEY, VALUE>(conf, fileSplit);
}
From source file:mlbench.bayes.test.BayesTest.java
License:Apache License
/**
 * Entry point for the MPI-D Bayes test phase. Runs in one of two roles
 * depending on which bipartite communicator is active:
 * the O side classifies every input vector with a trained naive-Bayes model
 * and sends (label, score-vector) pairs; the A side receives those pairs,
 * writes them to a sequence file, and (on rank 0) builds the result analysis.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        // --- O side: classify inputs and emit results ---
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, config);
        classifier = new StandardNaiveBayesClassifier(model);
        MPI_D.COMM_BIPARTITE_O.Barrier();
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                (JobConf) config, inDir, rank);
        for (int i = 0; i < inputs.length; i++) {
            FileSplit fsplit = inputs[i];
            SequenceFileRecordReader<Text, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                    fsplit);
            Text key = kvrr.createKey();
            VectorWritable value = kvrr.createValue();
            while (kvrr.next(key, value)) {
                Vector result = classifier.classifyFull(value.get());
                // Assumes keys are of the form "/<label>/...", so SLASH-split
                // index [1] is the label — TODO confirm against the writer.
                MPI_D.Send(new Text(SLASH.split(key.toString())[1]), new VectorWritable(result));
            }
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        // --- A side: receive classified vectors and persist them ---
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDir);
        // NOTE(review): '.toString().toString()' is redundant (second call is
        // a no-op on a String); kept byte-identical here.
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString());
        ((JobConf) config).setOutputKeyClass(Text.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);
        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<Text, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }
        Text key = null;
        VectorWritable point = null;
        Vector vector = null;
        // Drain the MPI-D channel: each Recv yields a (key, value) pair, or
        // null when the O side has finished sending.
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            key = (Text) vals[0];
            point = (VectorWritable) vals[1];
            if (key != null && point != null) {
                vector = point.get();
                outrw.write(key, new VectorWritable(vector));
            }
            vals = MPI_D.Recv();
        }
        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }
        MPI_D.COMM_BIPARTITE_A.Barrier();
        if (rank == 0) {
            // load the labels
            Map<Integer, String> labelMap = BayesUtils.readLabelIndex(config, labPath);
            // loop over the results and create the confusion matrix
            SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>(
                    output, PathType.LIST, PathFilters.partFilter(), config);
            ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
            analyzeResults(labelMap, dirIterable, analyzer);
        }
    }
    MPI_D.Finalize();
}
From source file:mlbench.bayes.train.IndexInstances.java
License:Apache License
/**
 * Entry point for the MPI-D index-instances phase. The O side maps each
 * labeled input vector to an integer label index and sends (index, vector)
 * pairs; the A side receives them grouped by key, sums the vectors per label
 * index, and writes one accumulated vector per label to a sequence file.
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        if (rank == 0) {
            // Rank 0 builds the label index once; everyone then reads it from
            // the distributed cache after the barrier below.
            System.out.println(IndexInstances.class.getSimpleName() + " O start.");
            createLabelIndex(labPath);
        }
        HadoopUtil.cacheFiles(labPath, config);
        MPI_D.COMM_BIPARTITE_O.Barrier();
        OpenObjectIntHashMap<String> labelIndex = BayesUtils.readIndexFromCache(config);
        if (MPI_D.COMM_BIPARTITE_O != null) { // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            // NOTE(review): 'size' is computed but never used in this branch.
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
            FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                    (JobConf) config, inDir, rank);
            for (int i = 0; i < inputs.length; i++) {
                FileSplit fsplit = inputs[i];
                SequenceFileRecordReader<Text, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                        fsplit);
                Text labelText = kvrr.createKey();
                VectorWritable instance = kvrr.createValue();
                while (kvrr.next(labelText, instance)) {
                    // Assumes keys are "/<label>/..." — TODO confirm format.
                    String label = SLASH.split(labelText.toString())[1];
                    // Silently skip labels missing from the index.
                    if (labelIndex.containsKey(label)) {
                        MPI_D.Send(new IntWritable(labelIndex.get(label)), instance);
                    }
                }
            }
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDir);
        // NOTE(review): '.toString().toString()' is redundant; kept as-is.
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString());
        ((JobConf) config).setOutputKeyClass(IntWritable.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<IntWritable, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);
        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<IntWritable, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        Vector vector = null;
        Object[] vals = MPI_D.Recv();
        // Group-by-key merge: records arrive ordered by key (assumed from the
        // MPI-D shuffle — TODO confirm); when the key changes, flush the
        // accumulated sum vector for the previous key.
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) { // first record: nothing to flush
            } else if (!key.equals(newKey)) {
                outrw.write(key, new VectorWritable(vector));
                vector = null;
            }
            if (vector == null) {
                vector = newPoint.get();
            } else {
                vector.assign(newPoint.get(), Functions.PLUS);
            }
            key = newKey;
            point = newPoint;
            vals = MPI_D.Recv();
        }
        // Flush the final group, if any records were received at all.
        if (newKey != null && newPoint != null) {
            outrw.write(key, new VectorWritable(vector));
        }
        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }
    }
    MPI_D.Finalize();
}
From source file:mlbench.bayes.train.WeightSummer.java
License:Apache License
/**
 * Entry point for the MPI-D weight-summing phase. The O side accumulates, over
 * all local input vectors, the element-wise feature-weight sum and a per-label
 * weight total, then sends both as tagged vectors; the A side merges the
 * tagged vectors by key, writes them out, and (on rank 0) materializes and
 * re-serializes the resulting naive-Bayes model.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws MPI_D_Exception, IOException, MPIException {
    parseArgs(args);
    HashMap<String, String> conf = new HashMap<String, String>();
    initConf(conf);
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        // NOTE(review): 'size' is computed but never used in this branch.
        int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                (JobConf) config, inDir, rank);
        Vector weightsPerFeature = null;
        Vector weightsPerLabel = new DenseVector(labNum);
        for (int i = 0; i < inputs.length; i++) {
            FileSplit fsplit = inputs[i];
            SequenceFileRecordReader<IntWritable, VectorWritable> kvrr = new SequenceFileRecordReader<>(config,
                    fsplit);
            IntWritable index = kvrr.createKey();
            VectorWritable value = kvrr.createValue();
            while (kvrr.next(index, value)) {
                Vector instance = value.get();
                // Lazily size the feature accumulator from the first instance.
                if (weightsPerFeature == null) {
                    weightsPerFeature = new RandomAccessSparseVector(instance.size(),
                            instance.getNumNondefaultElements());
                }
                int label = index.get();
                weightsPerFeature.assign(instance, Functions.PLUS);
                weightsPerLabel.set(label, weightsPerLabel.get(label) + instance.zSum());
            }
        }
        // Only send if at least one instance was seen (accumulator non-null).
        if (weightsPerFeature != null) {
            MPI_D.Send(new Text(WEIGHTS_PER_FEATURE), new VectorWritable(weightsPerFeature));
            MPI_D.Send(new Text(WEIGHTS_PER_LABEL), new VectorWritable(weightsPerLabel));
        }
    } else if (MPI_D.COMM_BIPARTITE_A != null) {
        int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
        config.set(MAPRED_OUTPUT_DIR, outDirW);
        // NOTE(review): '.toString().toString()' is redundant; kept as-is.
        config.set("mapred.task.id", DataMPIUtil.getHadoopTaskAttemptID().toString().toString());
        ((JobConf) config).setOutputKeyClass(Text.class);
        ((JobConf) config).setOutputValueClass(VectorWritable.class);
        TaskAttemptContext taskContext = new TaskAttemptContextImpl(config,
                DataMPIUtil.getHadoopTaskAttemptID());
        SequenceFileOutputFormat<Text, VectorWritable> outfile = new SequenceFileOutputFormat<>();
        FileSystem fs = FileSystem.get(config);
        Path output = new Path(config.get(MAPRED_OUTPUT_DIR));
        FileOutputCommitter fcommitter = new FileOutputCommitter(output, taskContext);
        RecordWriter<Text, VectorWritable> outrw = null;
        try {
            fcommitter.setupJob(taskContext);
            outrw = outfile.getRecordWriter(fs, (JobConf) config, getOutputName(rank), null);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("ERROR: Please set the HDFS configuration properly\n");
            System.exit(-1);
        }
        Text key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        Vector vector = null;
        Object[] vals = MPI_D.Recv();
        // Group-by-key merge: sum vectors sharing a key, flushing the previous
        // group's accumulated vector whenever the key changes.
        while (vals != null) {
            newKey = (Text) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) { // first record: nothing to flush
            } else if (!key.equals(newKey)) {
                outrw.write(key, new VectorWritable(vector));
                vector = null;
            }
            if (vector == null) {
                vector = newPoint.get();
            } else {
                vector.assign(newPoint.get(), Functions.PLUS);
            }
            key = newKey;
            point = newPoint;
            vals = MPI_D.Recv();
        }
        // Flush the final group, if any records were received at all.
        if (newKey != null && newPoint != null) {
            outrw.write(key, new VectorWritable(vector));
        }
        outrw.close(null);
        if (fcommitter.needsTaskCommit(taskContext)) {
            fcommitter.commitTask(taskContext);
        }
        MPI_D.COMM_BIPARTITE_A.Barrier();
        if (rank == 0) {
            // Read the freshly written weights back as a model and serialize
            // it to the final output directory.
            Path resOut = new Path(outDir);
            NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(new Path(outDir), config);
            naiveBayesModel.serialize(resOut, config);
        }
    }
    MPI_D.Finalize();
}