Example usage for org.apache.hadoop.mapred SequenceFileInputFormat getRecordReader

List of usage examples for org.apache.hadoop.mapred SequenceFileInputFormat getRecordReader


On this page you can find example usages of org.apache.hadoop.mapred.SequenceFileInputFormat.getRecordReader.


public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException 

Source Link


From source file:edu.uci.ics.asterix.external.indexing.input.TextualDataReader.java

License:Apache License

/**
 * Creates a record reader for the input split at {@code splitIndex}.
 *
 * <p>The input format configured on {@code conf} is used as a {@code SequenceFileInputFormat} when it is
 * one, and is otherwise cast to {@code TextInputFormat}.
 *
 * <p>NOTE(review): this excerpt is truncated. Both {@code getRecordReader(...)} calls stop after
 * {@code conf,} and the closing braces are not visible. The missing last argument is presumably the
 * {@code Reporter}; confirm against the original source file.
 *
 * @param splitIndex index into {@code inputSplits} of the split to read
 * @return the reader obtained from the configured input format
 * @throws IOException declared by the method; raised by the underlying input format
 */
private RecordReader getRecordReader(int splitIndex) throws IOException {
    RecordReader reader;
    if (conf.getInputFormat() instanceof SequenceFileInputFormat) {
        SequenceFileInputFormat format = (SequenceFileInputFormat) conf.getInputFormat();
        reader = format.getRecordReader((org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf,
    } else {
        // Any format that is not a SequenceFileInputFormat is cast to TextInputFormat.
        TextInputFormat format = (TextInputFormat) conf.getInputFormat();
        reader = format.getRecordReader((org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf,
    return reader;

From source file:edu.uci.ics.asterix.external.indexing.input.TextualFullScanDataReader.java

License:Apache License

private RecordReader getRecordReader(int splitIndex) throws IOException {
    if (conf.getInputFormat() instanceof SequenceFileInputFormat) {
        SequenceFileInputFormat format = (SequenceFileInputFormat) conf.getInputFormat();
        RecordReader reader = format.getRecordReader(
                (org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter());
        return reader;
    } else {//  ww  w  .j  av a2s  . c  o m
        TextInputFormat format = (TextInputFormat) conf.getInputFormat();
        RecordReader reader = format.getRecordReader(
                (org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter());
        return reader;

From source file:mlbench.kmeans.KmeansInit.java

License:Apache License

/**
 * Gets the input values and chooses the centers of the K clusters.
 *
 * @param dataPath path of the input data to read
 * @throws MPI_D_Exception
 * @throws IOException
 * @throws MPIException
 */
private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf)
        throws MPI_D_Exception, IOException, MPIException {
    // NOTE(review): this excerpt appears to have lost lines (closing braces and some statements are
    // not visible). The comments below describe only the statements that are shown.
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    // Branch taken when the COMM_BIPARTITE_O communicator is available: read this rank's task inputs
    // and send their vectors out.
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);

        // for record the initialized state
        // Each split is opened as a sequence file of (LongWritable, VectorWritable) records.
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);

            // Fixed seed of 1000. NOTE(review): no use of random is visible in this excerpt.
            Random random = new Random(1000);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            IntWritable cluster = new IntWritable();
            while (reader.next(k, v)) {
                // Sends every record's vector keyed by cluster. NOTE(review): no assignment to cluster
                // is visible here; presumably it is set from random and kCluster in a missing line — confirm.
                MPI_D.Send(cluster, v);
    } else {
        // Otherwise: receive (IntWritable, VectorWritable) pairs until MPI_D.Recv() returns null,
        // accumulating the vectors of consecutive pairs that share a key.
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        double sum[] = null;
        int count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
                // First pair received: size the accumulator from the vector.
                sum = new double[newPoint.get().size()];
            } else if (!key.equals(newKey)) {
                // Key changed: divide the accumulated sum by count to form the previous key's center,
                // then reset the accumulator.
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                sum = new double[point.get().size()];
                // NOTE(review): no increment of count is visible in this excerpt — confirm against the
                // original file.
                count = 0;
            key = newKey;
            point = newPoint;
            KmeansUtils.accumulate(sum, newPoint.get());
            vals = MPI_D.Recv();
        // After the receive loop: build the center for the last key that was accumulated.
        if (newKey != null && newPoint != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            PointVector oneCenter = new PointVector(key.get(), centerVals);

        transfer = new KmeansUtils.CenterTransfer(config, rank, size);

        // Rank 0 writes each center's toString() followed by a newline to the stream opened for outPath.
        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            // NOTE(review): how centers is populated is not visible in this excerpt.
            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());

        System.out.println("rank " + rank + " finish");

From source file:mlbench.kmeans.KmeansIter.java

License:Apache License

/**
 * Calculates the new centers iteratively.
 *
 * @return true: finish; false: continue
 * @throws MPI_D_Exception
 * @throws MPIException
 * @throws IOException
 */
private static void iterBody(String args[], HashMap<String, String> conf)
        throws MPI_D_Exception, MPIException, IOException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);

    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);

        if (rank == 0) {
            DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config);

            String lineVal;
            try {
                while ((lineVal = in.readLine()) != null) {
                    String lineSeq[] = lineVal.split(":");
                    PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1]));
            } catch (IOException e) {
            } finally {
                try {
                } catch (IOException e) {

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);

        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        double centerSum[][] = new double[kCluster][];
        long centerPNum[] = new long[kCluster];

        // for record the initialized state
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            while (reader.next(k, v)) {
                int centerBelong = (int) getBelongPoint(v);
                //                    int i = (int) p.getStrClusterClass();
                //                    double[] vals = p.getDoubleValue();
                int len = v.get().size();
                if (centerSum[centerBelong] == null) {
                    centerSum[centerBelong] = new double[len];
                for (int j = 0; j < len; j++) {
                    centerSum[centerBelong][j] += v.get().get(j);

        for (int i = 0; i < centerPNum.length; i++) {
            if (centerSum[i] == null && centerPNum[i] == 0) {
            MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i]));
    } else {
        IntWritable key = null, newKey = null;
        KmeansCenters value = null, newValue = null;
        double sum[] = null;
        long count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newValue = (KmeansCenters) vals[1];
            if (key == null && value == null) {
                sum = new double[newValue.getVector().length];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = (double) sum[i] / count;
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                sum = new double[value.getVector().length];
                count = 0;
            key = newKey;
            value = newValue;
            KmeansUtils.accumulate(sum, newValue.getVector());
            count += Long.valueOf(newValue.getPointSize());
            vals = MPI_D.Recv();
        if (newKey != null && newValue != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            PointVector oneCenter = new PointVector(key.get(), centerVals);

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());