Example usage for org.apache.hadoop.mapred SequenceFileInputFormat SequenceFileInputFormat

Introduction

On this page you can find example usage for the org.apache.hadoop.mapred.SequenceFileInputFormat constructor, SequenceFileInputFormat().

Prototype

public SequenceFileInputFormat() 
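
For orientation, here is a minimal, self-contained sketch of how this no-argument constructor is typically used with the old org.apache.hadoop.mapred API: instantiate the format, point the JobConf at an input directory, and obtain splits and a record reader from it. The input path "/tmp/seq-input" and the LongWritable/Text key/value types are illustrative assumptions, not taken from the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

public class SequenceFileInputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        // Placeholder input directory containing sequence files.
        FileInputFormat.setInputPaths(jobConf, new Path("/tmp/seq-input"));

        // The no-argument constructor shown in the prototype above.
        SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();

        // Compute splits and read the records of the first split,
        // assuming the files store LongWritable keys and Text values.
        InputSplit[] splits = format.getSplits(jobConf, 1);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[0], jobConf, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            System.out.println(key + "\t" + value);
        }
        reader.close();
    }
}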

Usage

From source file:eu.stratosphere.addons.parquet.SequenceFileWordCount.java

License:Apache License

@Override
public Plan getPlan(String... args) {

    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource source = new HadoopDataSource(new SequenceFileInputFormat(), new JobConf(), "Input Lines");
    SequenceFileInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine()).input(source).name("Tokenize Lines").build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0).input(mapper)
            .name("Count Words").build();
    FileDataSink out = new FileDataSink(new CsvOutputFormat(), output, reducer, "Word Counts");
    CsvOutputFormat.configureRecordFormat(out).recordDelimiter('\n').fieldDelimiter(' ')
            .field(StringValue.class, 0).field(IntValue.class, 1);

    Plan plan = new Plan(out, "WordCount Example with a Sequence File as Input");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}

From source file:mlbench.kmeans.KmeansInit.java

License:Apache License

/**
 * Read the input values and choose the initial centers for the K clusters.
 *
 * @param dataPath path of the input data
 * @throws MPI_D_Exception
 * @throws IOException
 * @throws MPIException
 */
@SuppressWarnings("deprecation")
private static void init(String args[], String dataPath, int kCluster, HashMap<String, String> conf)
        throws MPI_D_Exception, IOException, MPIException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);
    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);

        // read each input split and send every point to a randomly chosen cluster
        for (FileSplit path : inputs) {
            SequenceFileInputFormat<LongWritable, VectorWritable> f = new SequenceFileInputFormat<LongWritable, VectorWritable>();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);

            Random random = new Random(1000);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            IntWritable cluster = new IntWritable();
            while (reader.next(k, v)) {
                cluster.set(random.nextInt(kCluster));
                MPI_D.Send(cluster, v);
            }
            reader.close();
        }
    } else {
        IntWritable key = null, newKey = null;
        VectorWritable point = null, newPoint = null;
        double sum[] = null;
        int count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newPoint = (VectorWritable) vals[1];
            if (key == null && point == null) {
                sum = new double[newPoint.get().size()];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[point.get().size()];
                count = 0;
            }
            key = newKey;
            point = newPoint;
            KmeansUtils.accumulate(sum, newPoint.get());
            count++;
            vals = MPI_D.Recv();
        }
        if (newKey != null && newPoint != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }

        System.out.println("rank " + rank + " finish");
    }
    MPI_D.Finalize();
}

From source file:mlbench.kmeans.KmeansIter.java

License:Apache License

/**
 * Calculate the new cluster centers for one iteration.
 *
 * @throws MPI_D_Exception
 * @throws MPIException
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private static void iterBody(String args[], HashMap<String, String> conf)
        throws MPI_D_Exception, MPIException, IOException {
    MPI_D.Init(args, MPI_D.Mode.Common, conf);

    if (MPI_D.COMM_BIPARTITE_O != null) {
        rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
        size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);

        if (rank == 0) {
            System.out.println(centerPath);
            DataInputStream in = KmeansUtils.readFromHDFSF(new Path(centerPath), config);

            String lineVal;
            try {
                while ((lineVal = in.readLine()) != null) {
                    String lineSeq[] = lineVal.split(":");
                    PointVector p = new PointVector(Integer.valueOf(lineSeq[0]), format(lineSeq[1]));
                    centers.add(p);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        MPI_D.COMM_BIPARTITE_O.Barrier();

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.broadcastCenters(centers);

        FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, config,
                dataPath, rank);
        double centerSum[][] = new double[kCluster][];
        long centerPNum[] = new long[kCluster];

        // read each input split and accumulate per-cluster sums and point counts
        for (FileSplit path : inputs) {
            SequenceFileInputFormat<LongWritable, VectorWritable> f = new SequenceFileInputFormat<LongWritable, VectorWritable>();
            JobConf jobConf = new JobConf(confPath);
            Reporter r = new KmeansUtils.EmptyReport();
            RecordReader<LongWritable, VectorWritable> reader = f.getRecordReader(path, jobConf, r);
            LongWritable k = reader.createKey();
            VectorWritable v = reader.createValue();

            while (reader.next(k, v)) {
                int centerBelong = (int) getBelongPoint(v);
                //                    int i = (int) p.getStrClusterClass();
                //                    double[] vals = p.getDoubleValue();
                int len = v.get().size();
                if (centerSum[centerBelong] == null) {
                    centerSum[centerBelong] = new double[len];
                }
                for (int j = 0; j < len; j++) {
                    centerSum[centerBelong][j] += v.get().get(j);
                }
                centerPNum[centerBelong]++;
            }
            reader.close();
        }

        for (int i = 0; i < centerPNum.length; i++) {
            if (centerSum[i] == null && centerPNum[i] == 0) {
                continue;
            }
            MPI_D.Send(new IntWritable(i), new KmeansCenters(centerPNum[i], centerSum[i]));
        }
    } else {
        centers.clear();
        IntWritable key = null, newKey = null;
        KmeansCenters value = null, newValue = null;
        double sum[] = null;
        long count = 0;
        Object[] vals = MPI_D.Recv();
        while (vals != null) {
            newKey = (IntWritable) vals[0];
            newValue = (KmeansCenters) vals[1];
            if (key == null && value == null) {
                sum = new double[newValue.getVector().length];
            } else if (!key.equals(newKey)) {
                double[] centerVals = new double[sum.length];
                for (int i = 0; i < centerVals.length; i++) {
                    centerVals[i] = (double) sum[i] / count;
                }
                PointVector oneCenter = new PointVector(Integer.valueOf(key.toString()), centerVals);
                centers.add(oneCenter);
                sum = new double[value.getVector().length];
                count = 0;
            }
            key = newKey;
            value = newValue;
            KmeansUtils.accumulate(sum, newValue.getVector());
            count += Long.valueOf(newValue.getPointSize());
            vals = MPI_D.Recv();
        }
        if (newKey != null && newValue != null) {
            double[] centerVals = new double[sum.length];
            for (int i = 0; i < centerVals.length; i++) {
                centerVals[i] = sum[i] / count;
            }
            PointVector oneCenter = new PointVector(key.get(), centerVals);
            centers.add(oneCenter);
        }

        KmeansUtils.CenterTransfer transfer = new KmeansUtils.CenterTransfer(config, rank, size);
        transfer.gatherCentersByP2P(centers);

        if (rank == 0) {
            OutputStream resOut = KmeansUtils.getOutputStream(outPath, config);
            DataOutput os = new DataOutputStream(resOut);

            for (PointVector centerPoint : centers) {
                os.write((centerPoint.toString() + "\n").getBytes());
            }
            resOut.flush();
            resOut.close();
        }
    }
    MPI_D.Finalize();
}

From source file:org.apache.hawq.pxf.plugins.hdfs.SequenceFileAccessor.java

License:Apache License

/**
 * Constructs a SequenceFileAccessor.
 *
 * @param input all input parameters coming from the client request
 */
public SequenceFileAccessor(InputData input) {
    super(input, new SequenceFileInputFormat<Writable, Writable>());
}

From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java

License:Apache License

@Test(timeout = 5000)
public void testSingleSplit() throws Exception {

    Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);

    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();

    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10);
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event);
    input.handleEvents(eventList);

    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
        readerCount++;
        while (reader.next()) {
            if (data1.size() == 0) {
                fail("Found more records than expected");
            }
            Object key = reader.getCurrentKey();
            Object val = reader.getCurrentValue();
            assertEquals(val, data1.remove(key));
        }
    }
    assertEquals(1, readerCount);
}

From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java

License:Apache License

@Test(timeout = 5000)
public void testMultipleSplits() throws Exception {

    Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);

    MultiMRInput input = new MultiMRInput(inputContext, 2);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();

    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();

    String file1 = "file1";
    LinkedHashMap<LongWritable, Text> data1 = createInputData(localFs, workDir, jobConf, file1, 0, 10);

    String file2 = "file2";
    LinkedHashMap<LongWritable, Text> data2 = createInputData(localFs, workDir, jobConf, file2, 10, 20);

    data.putAll(data1);
    data.putAll(data2);

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 2);
    assertEquals(2, splits.length);

    MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto1.toByteString().asReadOnlyByteBuffer());

    MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto2.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    input.handleEvents(eventList);

    int readerCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
        readerCount++;
        while (reader.next()) {
            if (data.size() == 0) {
                fail("Found more records than expected");
            }
            Object key = reader.getCurrentKey();
            Object val = reader.getCurrentValue();
            assertEquals(val, data.remove(key));
        }
    }
    assertEquals(2, readerCount);
}

From source file:org.apache.tez.mapreduce.input.TestMultiMRInput.java

License:Apache License

@Test(timeout = 5000)
public void testExtraEvents() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testExtraEvents");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf));
    byte[] payload = builder.build().toByteArray();

    InputContext inputContext = createTezInputContext(payload);

    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    List<Event> eventList = new ArrayList<Event>();

    String file1 = "file1";
    createInputData(localFs, workDir, jobConf, file1, 0, 10);
    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 = InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());
    InputDataInformationEvent event2 = InputDataInformationEvent.createWithSerializedPayload(1,
            splitProto.toByteString().asReadOnlyByteBuffer());

    eventList.clear();
    eventList.add(event1);
    eventList.add(event2);
    try {
        input.handleEvents(eventList);
        fail("Expecting Exception due to too many events");
    } catch (Exception e) {
        assertTrue(e.getMessage().contains("Unexpected event. All physical sources already initialized"));
    }
}

From source file:org.apache.tez.mapreduce.processor.MapUtils.java

License:Apache License

private static InputSplit createInputSplit(FileSystem fs, Path workDir, JobConf job, Path file)
        throws IOException {
    FileInputFormat.setInputPaths(job, workDir);

    LOG.info("Generating data at path: " + file);
    // write a small sequence file of random <LongWritable, Text> entries
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, file, LongWritable.class, Text.class);
    try {
        Random r = new Random(System.currentTimeMillis());
        LongWritable key = new LongWritable();
        Text value = new Text();
        for (int i = 10; i > 0; i--) {
            key.set(r.nextInt(1000));
            value.set(Integer.toString(i));
            writer.append(key, value);
            LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
        }
    } finally {
        writer.close();
    }

    SequenceFileInputFormat<LongWritable, Text> format = new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(job, 1);
    System.err.println("#split = " + splits.length + " ; " + "#locs = " + splits[0].getLocations().length + "; "
            + "loc = " + splits[0].getLocations()[0] + "; " + "off = " + splits[0].getLength() + "; "
            + "file = " + ((FileSplit) splits[0]).getPath());
    return splits[0];
}