Example usage for org.apache.hadoop.mapred SequenceFileInputFormat getRecordReader

Introduction

On this page you can find example usage for org.apache.hadoop.mapred.SequenceFileInputFormat.getRecordReader.

Prototype

public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException 
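
As a quick orientation before the real-world examples below, here is a minimal, self-contained sketch of the typical call pattern: a JobConf points at a SequenceFile, getSplits produces the input splits, and getRecordReader turns each split into a RecordReader that is drained with createKey/createValue/next. The input path /tmp/data.seq and the LongWritable/Text key and value types are illustrative assumptions, not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

public class SequenceFileReadSketch {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf();
        // Assumed input location; must point at an existing SequenceFile.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/data.seq"));

        SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
        for (InputSplit split : format.getSplits(conf, 1)) {
            // Reporter.NULL is a no-op reporter, sufficient outside a running MapReduce task.
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, conf, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
            reader.close();
        }
    }
}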

Usage

From source file: edu.uci.ics.asterix.external.indexing.input.TextualDataReader.java

License: Apache License

private RecordReader getRecordReader(int splitIndex) throws IOException {
    RecordReader reader;
    if (conf.getInputFormat() instanceof SequenceFileInputFormat) {
        SequenceFileInputFormat format = (SequenceFileInputFormat) conf.getInputFormat();
        reader = format.getRecordReader((org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf,
                getReporter());
    } else {
        TextInputFormat format = (TextInputFormat) conf.getInputFormat();
        reader = format.getRecordReader((org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf,
                getReporter());
    }
    return reader;
}

From source file: edu.uci.ics.asterix.external.indexing.input.TextualFullScanDataReader.java

License: Apache License

@SuppressWarnings("rawtypes")
private RecordReader getRecordReader(int splitIndex) throws IOException {
    if (conf.getInputFormat() instanceof SequenceFileInputFormat) {
        SequenceFileInputFormat format = (SequenceFileInputFormat) conf.getInputFormat();
        RecordReader reader = format.getRecordReader(
                (org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter());
        return reader;
    } else {
        TextInputFormat format = (TextInputFormat) conf.getInputFormat();
        RecordReader reader = format.getRecordReader(
                (org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter());
        return reader;
    }
}

From source file: mlbench.kmeans.KmeansInit.java

License: Apache License

/**
 * Get the input values and choose the initial K cluster centers.
 *
 * @param dataPath
 * @throws MPI_D_Exception
 * @throws IOException
 * @throws MPIException
 */
@SuppressWarnings("deprecation")
private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf)
        throws MPI_D_Exception, IOException, MPIException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);

        // read each input split and send every point to a randomly chosen cluster
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);

            Random random = new Random(1000);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            IntWritable cluster = new IntWritable();
            while (reader.next(k, v)) {
                cluster.set(random.nextInt(kCluster));
                MPI_D.Send(cluster, v);
            }
            reader.close();
        }
    } else {
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        double sum[] = null;
        int count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
                sum = new double[newPoint.get().size()];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[point.get().size()];
                count = 0;
            }
            key = newKey;
            point = newPoint;
            KmeansUtils.accumulate(sum, newPoint.get());
            count++;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }

        System.out.println("rank " + rank + " finish");
    }
    MPI_D.Finalize();
}

From source file: mlbench.kmeans.KmeansIter.java

License: Apache License

/**
 * Calculate the new center iteratively
 *
 * @return true: finish; false: continue
 * @throws MPI_D_Exception
 * @throws MPIException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void iterBody(String args[], HashMap<String, String> conf)
        throws MPI_D_Exception, MPIException, IOException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);

    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);

        if (rank == 0) {
            System.out.println(centerPath);
            DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config);

            String lineVal;
            try {
                while ((lineVal = in.readLine()) != null) {
                    String lineSeq[] = lineVal.split(":");
                    PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1]));
                    centers.add(p);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        MPI_D.COMM_BIPARTITE_O.Barrier();

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.broadcastCenters(centers);

        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        double centerSum[][] = new double[kCluster][];
        long centerPNum[] = new long[kCluster];

        // read each input split and accumulate point sums and counts per nearest center
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            while (reader.next(k, v)) {
                int centerBelong = (int) getBelongPoint(v);
                //                    int i = (int) p.getStrClusterClass();
                //                    double[] vals = p.getDoubleValue();
                int len = v.get().size();
                if (centerSum[centerBelong] == null) {
                    centerSum[centerBelong] = new double[len];
                }
                for (int j = 0; j < len; j++) {
                    centerSum[centerBelong][j] += v.get().get(j);
                }
                centerPNum[centerBelong]++;
            }
            reader.close();
        }

        for (int i = 0; i < centerPNum.length; i++) {
            if (centerSum[i] == null && centerPNum[i] == 0) {
                continue;
            }
            MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i]));
        }
    } else {
        centers.clear();
        IntWritable key = null, newKey = null;
        KmeansCenters value = null, newValue = null;
        double sum[] = null;
        long count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newValue = (KmeansCenters) vals[1];
            if (key == null && value == null) {
                sum = new double[newValue.getVector().length];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = (double) sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[value.getVector().length];
                count = 0;
            }
            key = newKey;
            value = newValue;
            KmeansUtils.accumulate(sum, newValue.getVector());
            count += Long.valueOf(newValue.getPointSize());
            vals = MPI_D.Recv();
        }
        if (newKey != null && newValue != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
    }
    MPI_D.Finalize();
}
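
A small design note on the two KMeans examples above: a fresh SequenceFileInputFormat, JobConf, and Reporter are constructed inside the per-split loop, although none of them depend on the individual split. The sketch below shows the same read loop with those allocations hoisted out of the loop; it reuses the variable names from the examples (inputs, confPath, KmeansUtils) and is an illustrative rewrite, not the benchmark's actual code.

SequenceFileInputFormat<LongWritable, VectorWritable> format = new SequenceFileInputFormat<LongWritable, VectorWritable>();
JobConf jobConf = new JobConf(confPath);
Reporter reporter = new KmeansUtils.EmptyReport();
for (FileSplit path : inputs) {
    // one reader per split, but the format, job configuration, and reporter are shared
    RecordReader<LongWritable, VectorWritable> reader = format.getRecordReader(path, jobConf, reporter);
    LongWritable k = reader.createKey();
    VectorWritable v = reader.createValue();
    while (reader.next(k, v)) {
        // process one (k, v) record, e.g. assign the vector to a cluster
    }
    reader.close();
}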