List of usage examples for org.apache.hadoop.mapred.SequenceFileInputFormat
public SequenceFileInputFormat()
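Before the project examples, a minimal, self-contained sketch of the usual call sequence may help: set input paths on a JobConf, ask the format for splits, then iterate each split's RecordReader. The class name SequenceFileDump, the /tmp/seq-input path, and the LongWritable/Text key and value types are illustrative assumptions, not taken from any example below; a real SequenceFile records its own key/value classes in its header.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

public class SequenceFileDump {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf();
        // hypothetical input directory; any directory of SequenceFiles works
        FileInputFormat.setInputPaths(conf, new Path("/tmp/seq-input"));
        SequenceFileInputFormat<LongWritable, Text> format =
                new SequenceFileInputFormat<LongWritable, Text>();
        // the second argument is only a hint for the desired number of splits
        InputSplit[] splits = format.getSplits(conf, 1);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                    format.getRecordReader(split, conf, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
            reader.close();
        }
    }
}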
From source file:eu.stratosphere.addons.parquet.SequenceFileWordCount.java
License:Apache License
@Override
public Plan getPlan(String... args) {
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource source = new HadoopDataSource(new SequenceFileInputFormat(), new JobConf(), "Input Lines");
    SequenceFileInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine())
            .input(source)
            .name("Tokenize Lines")
            .build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
            .input(mapper)
            .name("Count Words")
            .build();
    FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, reducer, "Word Counts");
    CsvOutputFormat.configureRecordFormat(out)
            .recordDelimiter('\n')
            .fieldDelimiter(' ')
            .field(StringValue.class, 0)
            .field(IntValue.class, 1);

    Plan plan = new Plan(out, "WordCount Example with a Sequence File as Input");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}
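This example shows the wrapping pattern Stratosphere used for Hadoop formats: the raw mapred SequenceFileInputFormat is handed to a HadoopDataSource, while the input path is still configured through the format's own static addInputPath on the wrapped JobConf.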
From source file:mlbench.kmeans.KmeansInit.java
License:Apache License
/**
 * Get the input values and choose the K clusters' centers.
 *
 * @param dataPath
 * @throws MPI_D_Exception
 * @throws IOException
 * @throws MPIException
 */
@SuppressWarnings("deprecation")
private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf)
        throws MPI_D_Exception, IOException, MPIException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);

        // randomly assign each input record to one of the kCluster clusters and send it
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);

            Random random = new Random(1000);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();
            IntWritable cluster = new IntWritable();
            while (reader.next(k, v)) {
                cluster.set(random.nextInt(kCluster));
                MPI_D.Send(cluster, v);
            }
            reader.close();
        }
    } else {
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        double sum[] = null;
        int count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
                sum = new double[newPoint.get().size()];
            } else if (!key.equals(newKey)) {
                // key changed: finalize the center for the previous cluster id
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[point.get().size()];
                count = 0;
            }
            key = newKey;
            point = newPoint;
            KmeansUtils.accumulate(sum, newPoint.get());
            count++;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);
        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);
            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
        System.out.println("rank " + rank + " finish");
    }
    MPI_D.Finalize();
}
From source file:mlbench.kmeans.KmeansIter.java
License:Apache License
/**
 * Calculate the new centers iteratively.
 *
 * @throws MPI_D_Exception
 * @throws MPIException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void iterBody(String args[], HashMap<String, String> conf)
        throws MPI_D_Exception, MPIException, IOException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);

        if (rank == 0) {
            System.out.println(centerPath);
            DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config);
            String lineVal;
            try {
                while ((lineVal = in.readLine()) != null) {
                    String lineSeq[] = lineVal.split(":");
                    PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1]));
                    centers.add(p);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        MPI_D.COMM_BIPARTITE_O.Barrier();

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.broadcastCenters(centers);

        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        double centerSum[][] = new double[kCluster][];
        long centerPNum[] = new long[kCluster];

        // accumulate, per cluster, the coordinate sums and the point count of its members
        for (FileSplit path : inputs) {
            SequenceFileInputFormat f = new SequenceFileInputFormat();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            while (reader.next(k, v)) {
                int centerBelong = (int) getBelongPoint(v);
                int len = v.get().size();
                if (centerSum[centerBelong] == null) {
                    centerSum[centerBelong] = new double[len];
                }
                for (int j = 0; j < len; j++) {
                    centerSum[centerBelong][j] += v.get().get(j);
                }
                centerPNum[centerBelong]++;
            }
            reader.close();
        }

        for (int i = 0; i < centerPNum.length; i++) {
            if (centerSum[i] == null && centerPNum[i] == 0) {
                continue;
            }
            MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i]));
        }
    } else {
        centers.clear();
        IntWritable key = null, newKey = null;
        KmeansCenters value = null, newValue = null;
        double sum[] = null;
        long count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newValue = (KmeansCenters) vals[1];
            if (key == null && value == null) {
                sum = new double[newValue.getVector().length];
            } else if (!key.equals(newKey)) {
                // key changed: finalize the center for the previous cluster id
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[value.getVector().length];
                count = 0;
            }
            key = newKey;
            value = newValue;
            KmeansUtils.accumulate(sum, newValue.getVector());
            count += Long.valueOf(newValue.getPointSize());
            vals = MPI_D.Recv();
        }
        if (newKey != null && newValue != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);
        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);
            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
    }
    MPI_D.Finalize();
}
From source file:org.apache.hawq.pxf.plugins.hdfs.SequenceFileAccessor.java
License:Apache License
/**
 * Constructs a SequenceFileAccessor.
 *
 * @param input all input parameters coming from the client request
 */
public SequenceFileAccessor(InputData input) {
    super(input, new SequenceFileInputFormat<Writable, Writable>());
}
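Here the constructor only forwards the format to the superclass, which handles split enumeration and record reading through it; the <Writable, Writable> type parameters keep the accessor generic, since a SequenceFile records its concrete key and value classes in its own header.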
From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java
License:Apache License
@Test(timeout = 5000)
public void testSingleSplit() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);
    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();

    List<Event> eventList = new ArrayList<Event>();
    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10);

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event);
    input.handleEvents(eventList);

    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
        readerCount++;
        while (reader.next()) {
            if (data1.size() == 0) {
                fail("Found more records than expected");
            }
            Object key = reader.getCurrentKey();
            Object val = reader.getCurrentValue();
            assertEquals(val, data1.remove(key));
        }
    }
    assertEquals(1, readerCount);
}
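Note that in this and the following Tez tests the mapred format is used only up front to compute splits; each split is serialized into an MRSplitProto event, and the records are then read back through Tez's KeyValueReader rather than directly through a Hadoop RecordReader.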
From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java
License:Apache License
@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);
    MultiMRInput input = new MultiMRInput(inputContext, 2);
    input.initialize();

    List<Event> eventList = new ArrayList<Event>();
    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();
    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10);
    String file2 = "file2";
    LinkedHashMap<LongWritable, Text> data2 = createInputData(localFs, workDir, jobConf, file2, 10, 20);
    data.putAll(data1);
    data.putAll(data2);

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 2);
    assertEquals(2, splits.length);

    MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto1.toByteString().asReadOnlyByteBuffer());
    MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto2.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    input.handleEvents(eventList);

    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
        readerCount++;
        while (reader.next()) {
            if (data.size() == 0) {
                fail("Found more records than expected");
            }
            Object key = reader.getCurrentKey();
            Object val = reader.getCurrentValue();
            assertEquals(val, data.remove(key));
        }
    }
    assertEquals(2, readerCount);
}
From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java
License:Apache License
@Test(timeout = 5000)
public void testExtraEvents() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testExtraEvents");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);
    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();

    List<Event> eventList = new ArrayList<Event>();
    String file1 = "file1";
    createInputData(localFs, workDir, jobConf, file1, 0, 10);

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(1,
            splitProto.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    try {
        input.handleEvents(eventList);
        fail("Expecting Exception due to too many events");
    } catch (Exception e) {
        assertTrue(e.getMessage().contains("Unexpected event. All physical sources already initialized"));
    }
}
From source file:org.apache.tez.mapreduce.processor.MapUtils.java
License:Apache License
private static InputSplit createInputSplit(FileSystem fs, Path workDir, JobConf job, Path file)
        throws IOException {
    FileInputFormat.setInputPaths(job, workDir);
    LOG.info("Generating data at path: " + file);

    // create a SequenceFile with ten <LongWritable, Text> entries
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, file, LongWritable.class, Text.class);
    try {
        Random r = new Random(System.currentTimeMillis());
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i = 10; i > 0; i--) {
            key.set(r.nextInt(1000));
            value.set(Integer.toString(i));
            writer.append(key, value);
            LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
        }
    } finally {
        writer.close();
    }

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(job, 1);
    System.err.println("#split = " + splits.length + " ; " + "#locs = " + splits[0].getLocations().length
            + "; " + "loc = " + splits[0].getLocations()[0] + "; " + "len = " + splits[0].getLength() + "; "
            + "file = " + ((FileSplit) splits[0]).getPath());
    return splits[0];
}
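A caller can then feed the returned split back to a RecordReader obtained from the same format, e.g. format.getRecordReader(split, job, Reporter.NULL), following the same read pattern shown in the k-means examples above.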