Usage examples for org.apache.hadoop.mapred.SequenceFileInputFormat#getRecordReader
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException
From source file:edu.uci.ics.asterix.external.indexing.input.TextualDataReader.java
License:Apache License
private RecordReader getRecordReader(int splitIndex) throws IOException { RecordReader reader;//from ww w .j a v a 2s .c om if (conf.getInputFormat() instanceof SequenceFileInputFormat) { SequenceFileInputFormat format = (SequenceFileInputFormat) conf.getInputFormat(); reader = format.getRecordReader((org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter()); } else { TextInputFormat format = (TextInputFormat) conf.getInputFormat(); reader = format.getRecordReader((org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter()); } return reader; }
From source file:edu.uci.ics.asterix.external.indexing.input.TextualFullScanDataReader.java
License:Apache License
@SuppressWarnings("rawtypes") private RecordReader getRecordReader(int splitIndex) throws IOException { if (conf.getInputFormat() instanceof SequenceFileInputFormat) { SequenceFileInputFormat format = (SequenceFileInputFormat) conf.getInputFormat(); RecordReader reader = format.getRecordReader( (org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter()); return reader; } else {// ww w .j av a2s . c o m TextInputFormat format = (TextInputFormat) conf.getInputFormat(); RecordReader reader = format.getRecordReader( (org.apache.hadoop.mapred.FileSplit) inputSplits[splitIndex], conf, getReporter()); return reader; } }
From source file:mlbench.kmeans.KmeansInit.java
License:Apache License
/** * get the input values and choose the K clusters' centers * * @param dataPath/*w w w. j a v a 2 s . co m*/ * @throws MPI_D_Exception * @throws IOException * @throws MPIException */ @SuppressWarnings("deprecation") private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf) throws MPI_D_Exception, IOException, MPIException { MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config, dataPath, rank); // for record the initialized state for (FileSplit path : inputs) { SequenceFileInputFormat f = new SequenceFileInputFormat(); JobConf jobConf = new JobConf(confPath); Reporter r = new KmeansUtils.EmptyReport(); RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r); Random random = new Random(1000); LongWritable k = reader.createKey(); VectorWritable v = reader.createValue(); IntWritable cluster = new IntWritable(); while (reader.next(k, v)) { cluster.set(random.nextInt(kCluster)); MPI_D.Send(cluster, v); } reader.close(); } } else { IntWritable key = null, newKey = null; VectorWritable point = null, newPoint = null; double sum[] = null; int count = 0; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (IntWritable) vals[0]; newPoint = (VectorWritable) vals[1]; if (key == null && point == null) { sum = new double[newPoint.get().size()]; } else if (!key.equals(newKey)) { double[] centerVals = new double[sum.length]; for (int i = 0; i < centerVals.length; i++) { centerVals[i] = sum[i] / count; } PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals); centers.add(oneCenter); sum = new double[point.get().size()]; count = 0; } key = newKey; point = newPoint; KmeansUtils.accumulate(sum, newPoint.get()); count++; vals = MPI_D.Recv(); } if (newKey 
!= null && newPoint != null) { double[] centerVals = new double[sum.length]; for (int i = 0; i < centerVals.length; i++) { centerVals[i] = sum[i] / count; } PointVector oneCenter = new PointVector(key.get(), centerVals); centers.add(oneCenter); } transfer = new KmeansUtils.CenterTransfer(config, rank, size); transfer.gatherCentersByP2P(centers); if (rank == 0) { OutputStream resOut = KmeansUtils.getOutputStream(outPath, config); DataOutput os = new DataOutputStream(resOut); for (PointVector centerPoint : centers) { os.write((centerPoint.toString() + "\n").getBytes()); } resOut.flush(); resOut.close(); } System.out.println("rank " + rank + " finish"); } MPI_D.Finalize(); }
From source file:mlbench.kmeans.KmeansIter.java
License:Apache License
/** * Calculate the new center iteratively// w w w . jav a 2 s . c o m * * @return true: finish; false: continue * @throws MPI_D_Exception * @throws MPIException * @throws IOException */ @SuppressWarnings("deprecation") private static void iterBody(String args[], HashMap<String, String> conf) throws MPI_D_Exception, MPIException, IOException { MPI_D.Init(args, MPI_D.Mode.Common, conf); if (MPI_D.COMM_BIPARTITE_O != null) { rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O); size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O); if (rank == 0) { System.out.println(centerPath); DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config); String lineVal; try { while ((lineVal = in.readLine()) != null) { String lineSeq[] = lineVal.split(":"); PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1])); centers.add(p); } } catch (IOException e) { e.printStackTrace(); } finally { try { in.close(); } catch (IOException e) { e.printStackTrace(); } } } MPI_D.COMM_BIPARTITE_O.Barrier(); KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size); transfer.broadcastCenters(centers); FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config, dataPath, rank); double centerSum[][] = new double[kCluster][]; long centerPNum[] = new long[kCluster]; // for record the initialized state for (FileSplit path : inputs) { SequenceFileInputFormat f = new SequenceFileInputFormat(); JobConf jobConf = new JobConf(confPath); Reporter r = new KmeansUtils.EmptyReport(); RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r); LongWritable k = reader.createKey(); VectorWritable v = reader.createValue(); while (reader.next(k, v)) { int centerBelong = (int) getBelongPoint(v); // int i = (int) p.getStrClusterClass(); // double[] vals = p.getDoubleValue(); int len = v.get().size(); if (centerSum[centerBelong] == null) { centerSum[centerBelong] = new double[len]; } for (int 
j = 0; j < len; j++) { centerSum[centerBelong][j] += v.get().get(j); } centerPNum[centerBelong]++; } reader.close(); } for (int i = 0; i < centerPNum.length; i++) { if (centerSum[i] == null && centerPNum[i] == 0) { continue; } MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i])); } } else { centers.clear(); IntWritable key = null, newKey = null; KmeansCenters value = null, newValue = null; double sum[] = null; long count = 0; Object[] vals = MPI_D.Recv(); while (vals != null) { newKey = (IntWritable) vals[0]; newValue = (KmeansCenters) vals[1]; if (key == null && value == null) { sum = new double[newValue.getVector().length]; } else if (!key.equals(newKey)) { double[] centerVals = new double[sum.length]; for (int i = 0; i < centerVals.length; i++) { centerVals[i] = (double) sum[i] / count; } PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals); centers.add(oneCenter); sum = new double[value.getVector().length]; count = 0; } key = newKey; value = newValue; KmeansUtils.accumulate(sum, newValue.getVector()); count += Long.valueOf(newValue.getPointSize()); vals = MPI_D.Recv(); } if (newKey != null && newValue != null) { double[] centerVals = new double[sum.length]; for (int i = 0; i < centerVals.length; i++) { centerVals[i] = sum[i] / count; } PointVector oneCenter = new PointVector(key.get(), centerVals); centers.add(oneCenter); } KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size); transfer.gatherCentersByP2P(centers); if (rank == 0) { OutputStream resOut = KmeansUtils.getOutputStream(outPath, config); DataOutput os = new DataOutputStream(resOut); for (PointVector centerPoint : centers) { os.write((centerPoint.toString() + "\n").getBytes()); } resOut.flush(); resOut.close(); } } MPI_D.Finalize(); }