Example usage for org.apache.hadoop.io MapWritable put

Introduction

On this page you can find example usages of org.apache.hadoop.io.MapWritable.put.

Prototype

@Override
public Writable put(Writable key, Writable value)
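
Before the project-level examples below, here is a minimal, self-contained sketch of the call itself. The class and key names are illustrative and not taken from any of the projects listed.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class MapWritablePutSketch {
    public static void main(String[] args) {
        MapWritable map = new MapWritable();

        // put() accepts any Writable key/value pair and returns the previous
        // value stored under that key, or null if there was none.
        Writable previous = map.put(new Text("count"), new IntWritable(1));
        System.out.println(previous);                      // null
        System.out.println(map.get(new Text("count")));    // 1

        // Putting under the same key replaces the value and returns the old one.
        previous = map.put(new Text("count"), new IntWritable(2));
        System.out.println(previous);                      // 1
    }
}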


Usage

From source file:MRDriver.java

License:Apache License

public int run(String args[]) throws Exception {
    FileSystem fs = null;
    Path samplesMapPath = null;

    float epsilon = Float.parseFloat(args[0]);
    double delta = Double.parseDouble(args[1]);
    int minFreqPercent = Integer.parseInt(args[2]);
    int d = Integer.parseInt(args[3]);
    int datasetSize = Integer.parseInt(args[4]);
    int numSamples = Integer.parseInt(args[5]);
    double phi = Double.parseDouble(args[6]);
    Random rand;

    /************************ Job 1 (local FIM) Configuration ************************/

    JobConf conf = new JobConf(getConf());

    /*
     * Compute the number of required "votes" for an itemset to be
     * declared frequent.
     */
    // The +1 at the end is needed to ensure reqApproxNum > numSamples / 2.
    int reqApproxNum = (int) Math
            .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1;
    int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi)));
    //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum);

    conf.setInt("PARMM.reducersNum", numSamples);
    conf.setInt("PARMM.datasetSize", datasetSize);
    conf.setInt("PARMM.minFreqPercent", minFreqPercent);
    conf.setInt("PARMM.sampleSize", sampleSize);
    conf.setFloat("PARMM.epsilon", epsilon);

    // Set the number of reducers equal to the number of samples, to
    // maximize parallelism. Required by our Partitioner.
    conf.setNumReduceTasks(numSamples);

    // XXX: why do we disable the speculative execution? MR
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    /*
     * Enable compression of map output.
     *
     * We do it for this job and not for the aggregation one because
     * each mapper there only prints out one record for each itemset,
     * so there isn't much to compress, I'd say. MR
     *
     * In Amazon MapReduce, compression of the map output seems to
     * happen by default and the Snappy codec is used, which is
     * extremely fast.
     */
    conf.setBoolean("mapred.compress.map.output", true);
    //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class);

    conf.setJarByClass(MRDriver.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    // We write the collections found by the reducers as a SequenceFile
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9]));

    // set the mapper class based on command line option
    switch (Integer.parseInt(args[7])) {
    case 1:
        System.out.println("running partition mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(PartitionMapper.class);
        break;
    case 2:
        System.out.println("running binomial mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(BinomialSamplerMapper.class);
        break;
    case 3:
        System.out.println("running coin mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(CoinFlipSamplerMapper.class);
        break;
    case 4:
        System.out.println("running sampler mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(InputSamplerMapper.class);

        // create a random sample of size T*m
        rand = new Random();
        long sampling_start_time = System.nanoTime();
        int[] samples = new int[numSamples * sampleSize];
        for (int i = 0; i < numSamples * sampleSize; i++) {
            samples[i] = rand.nextInt(datasetSize);
        }

        // for each key in the sample, create a list of all T samples to which this key belongs
        Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>();
        for (int i = 0; i < numSamples * sampleSize; i++) {
            ArrayList<IntWritable> sampleIDs = null;
            LongWritable key = new LongWritable(samples[i]);
            if (hashTable.containsKey(key))
                sampleIDs = hashTable.get(key);
            else
                sampleIDs = new ArrayList<IntWritable>();
            sampleIDs.add(new IntWritable(i % numSamples));
            hashTable.put(key, sampleIDs);
        }

        /*
         * Convert the Hashtable to a MapWritable which we will
         * write to HDFS and distribute to all Mappers using
         * DistributedCache
         */
        MapWritable map = new MapWritable();
        for (LongWritable key : hashTable.keySet()) {
            ArrayList<IntWritable> sampleIDs = hashTable.get(key);
            IntArrayWritable sampleIDsIAW = new IntArrayWritable();
            sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()]));
            map.put(key, sampleIDsIAW);
        }

        fs = FileSystem.get(URI.create("samplesMap.ser"), conf);
        samplesMapPath = new Path("samplesMap.ser");
        FSDataOutputStream out = fs.create(samplesMapPath, true);
        map.write(out);
        out.sync();
        out.close();
        DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"),
                conf);
        // stop the sampling timer   
        long sampling_end_time = System.nanoTime();
        long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000;
        System.out.println("sampling runtime (milliseconds): " + sampling_runtime);
        break; // end switch case
    case 5:
        System.out.println("running random integer partition mapper...");
        conf.setInputFormat(WholeSplitInputFormat.class);
        Path inputFilePath = new Path(args[8]);
        WholeSplitInputFormat.addInputPath(conf, inputFilePath);
        conf.setMapperClass(RandIntPartSamplerMapper.class);
        // Compute number of map tasks.
        fs = inputFilePath.getFileSystem(conf);
        FileStatus inputFileStatus = fs.getFileStatus(inputFilePath);
        long len = inputFileStatus.getLen();
        long blockSize = inputFileStatus.getBlockSize();
        conf.setLong("mapred.min.split.size", blockSize);
        conf.setLong("mapred.max.split.size", blockSize);
        int mapTasksNum = ((int) (len / blockSize)) + 1;
        conf.setNumMapTasks(mapTasksNum);
        //System.out.println("len: " + len + " blockSize: " 
        //      + blockSize + " mapTasksNum: " + mapTasksNum);
        // Extract random integer partition of total sample
        // size into up to mapTasksNum partitions.
        // XXX I'm not sure this is a correct way to do
        // it.
        rand = new Random();
        IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples];
        for (int j = 0; j < numSamples; j++) {
            IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum];
            int sum = 0;
            int i;
            for (i = 0; i < mapTasksNum - 1; i++) {
                int size = rand.nextInt(sampleSize - sum);
                tempToSampleArr[i] = new IntWritable(size);
                sum += size;
                if (sum > numSamples * sampleSize) {
                    System.out.println("Something went wrong generating the sample Sizes");
                    System.exit(1);
                }
                if (sum == sampleSize) {
                    break;
                }
            }
            if (i == mapTasksNum - 1) {
                tempToSampleArr[i] = new IntWritable(sampleSize - sum);
            } else {
                for (; i < mapTasksNum; i++) {
                    tempToSampleArr[i] = new IntWritable(0);
                }
            }
            Collections.shuffle(Arrays.asList(tempToSampleArr));
            for (i = 0; i < mapTasksNum; i++) {
                toSampleArr[i][j] = tempToSampleArr[i];
            }
        }

        for (int i = 0; i < mapTasksNum; i++) {
            DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i);
        }
        break;
    default:
        System.err.println("Wrong Mapper ID. Can only be in [1,5]");
        System.exit(1);
        break;
    }

    /*
     * We don't use the default hash partitioner because we want to
     * maximize the parallelism. That's why we also fix the number
     * of reducers.
     */
    conf.setPartitionerClass(FIMPartitioner.class);

    conf.setReducerClass(FIMReducer.class);

    /************************ Job 2 (aggregation) Configuration ************************/

    JobConf confAggr = new JobConf(getConf());

    confAggr.setInt("PARMM.reducersNum", numSamples);
    confAggr.setInt("PARMM.reqApproxNum", reqApproxNum);
    confAggr.setInt("PARMM.sampleSize", sampleSize);
    confAggr.setFloat("PARMM.epsilon", epsilon);

    // XXX: Why do we disable speculative execution? MR
    confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    confAggr.setJarByClass(MRDriver.class);

    confAggr.setMapOutputKeyClass(Text.class);
    confAggr.setMapOutputValueClass(DoubleWritable.class);

    confAggr.setOutputKeyClass(Text.class);
    confAggr.setOutputValueClass(Text.class);

    confAggr.setMapperClass(AggregateMapper.class);
    confAggr.setReducerClass(AggregateReducer.class);

    confAggr.setInputFormat(CombineSequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9]));

    FileOutputFormat.setOutputPath(confAggr, new Path(args[10]));

    long FIMjob_start_time = System.currentTimeMillis();
    RunningJob FIMjob = JobClient.runJob(conf);
    long FIMjob_end_time = System.currentTimeMillis();

    RunningJob aggregateJob = JobClient.runJob(confAggr);
    long aggrJob_end_time = System.currentTimeMillis();

    long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time;

    long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time;

    if (args[7].equals("4")) {
        // Remove samplesMap file 
        fs.delete(samplesMapPath, false);
    }

    Counters counters = FIMjob.getCounters();
    Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart");
    long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()];
    int i = 0;
    for (Counters.Counter counter : FIMMapperStartTimesCounters) {
        FIMMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd");
    long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMMapperEndTimesCounters) {
        FIMMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart");
    long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerStartTimesCounters) {
        FIMReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd");
    long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerEndTimesCounters) {
        FIMReducerEndTimes[i++] = counter.getCounter();
    }

    Counters countersAggr = aggregateJob.getCounters();
    Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart");
    long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperStartTimesCounters) {
        AggregateMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd");
    long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperEndTimesCounters) {
        AggregateMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart");
    long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerStartTimesCounters) {
        AggregateReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd");
    long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerEndTimesCounters) {
        AggregateReducerEndTimes[i++] = counter.getCounter();
    }

    long FIMMapperStartMin = FIMMapperStartTimes[0];
    for (long l : FIMMapperStartTimes) {
        if (l < FIMMapperStartMin) {
            FIMMapperStartMin = l;
        }
    }
    long FIMMapperEndMax = FIMMapperEndTimes[0];
    for (long l : FIMMapperEndTimes) {
        if (l > FIMMapperEndMax) {
            FIMMapperEndMax = l;
        }
    }
    System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time));
    System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin));
    long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length];
    long FIMMapperRunTimesSum = 0;
    for (int l = 0; l < FIMMapperStartTimes.length; l++) {
        FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l];
        FIMMapperRunTimesSum += FIMMapperRunTimes[l];
    }
    System.out.println("FIMMapper average task runtime (milliseconds): "
            + FIMMapperRunTimesSum / FIMMapperStartTimes.length);
    long FIMMapperRunTimesMin = FIMMapperRunTimes[0];
    long FIMMapperRunTimesMax = FIMMapperRunTimes[0];
    for (long l : FIMMapperRunTimes) {
        if (l < FIMMapperRunTimesMin) {
            FIMMapperRunTimesMin = l;
        }
        if (l > FIMMapperRunTimesMax) {
            FIMMapperRunTimesMax = l;
        }
    }
    System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin);
    System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax);

    long FIMReducerStartMin = FIMReducerStartTimes[0];
    for (long l : FIMReducerStartTimes) {
        if (l < FIMReducerStartMin) {
            FIMReducerStartMin = l;
        }
    }
    long FIMReducerEndMax = FIMReducerEndTimes[0];
    for (long l : FIMReducerEndTimes) {
        if (l > FIMReducerEndMax) {
            FIMReducerEndMax = l;
        }
    }
    System.out
            .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax));
    System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin));
    long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length];
    long FIMReducerRunTimesSum = 0;
    for (int l = 0; l < FIMReducerStartTimes.length; l++) {
        FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l];
        FIMReducerRunTimesSum += FIMReducerRunTimes[l];
    }
    System.out.println("FIMReducer average task runtime (milliseconds): "
            + FIMReducerRunTimesSum / FIMReducerStartTimes.length);
    long FIMReducerRunTimesMin = FIMReducerRunTimes[0];
    long FIMReducerRunTimesMax = FIMReducerRunTimes[0];
    for (long l : FIMReducerRunTimes) {
        if (l < FIMReducerRunTimesMin) {
            FIMReducerRunTimesMin = l;
        }
        if (l > FIMReducerRunTimesMax) {
            FIMReducerRunTimesMax = l;
        }
    }
    System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin);
    System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax);
    System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax));

    long AggregateMapperStartMin = AggregateMapperStartTimes[0];
    for (long l : AggregateMapperStartTimes) {
        if (l < AggregateMapperStartMin) {
            AggregateMapperStartMin = l;
        }
    }
    long AggregateMapperEndMax = AggregateMapperEndTimes[0];
    for (long l : AggregateMapperEndTimes) {
        if (l > AggregateMapperEndMax) {
            AggregateMapperEndMax = l;
        }
    }
    System.out.println(
            "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time));
    System.out.println("AggregateMapper total runtime (milliseconds): "
            + (AggregateMapperEndMax - AggregateMapperStartMin));
    long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length];
    long AggregateMapperRunTimesSum = 0;
    for (int l = 0; l < AggregateMapperStartTimes.length; l++) {
        AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l];
        AggregateMapperRunTimesSum += AggregateMapperRunTimes[l];
    }
    System.out.println("AggregateMapper average task runtime (milliseconds): "
            + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length);
    long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0];
    long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0];
    for (long l : AggregateMapperRunTimes) {
        if (l < AggregateMapperRunTimesMin) {
            AggregateMapperRunTimesMin = l;
        }
        if (l > AggregateMapperRunTimesMax) {
            AggregateMapperRunTimesMax = l;
        }
    }
    System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin);
    System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax);

    long AggregateReducerStartMin = AggregateReducerStartTimes[0];
    for (long l : AggregateReducerStartTimes) {
        if (l < AggregateReducerStartMin) {
            AggregateReducerStartMin = l;
        }
    }
    long AggregateReducerEndMax = AggregateReducerEndTimes[0];
    for (long l : AggregateReducerEndTimes) {
        if (l > AggregateReducerEndMax) {
            AggregateReducerEndMax = l;
        }
    }
    System.out.println("Aggregate job round shuffle phase runtime (milliseconds): "
            + (AggregateReducerStartMin - AggregateMapperEndMax));
    System.out.println("AggregateReducer total runtime (milliseconds): "
            + (AggregateReducerEndMax - AggregateReducerStartMin));
    long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length];
    long AggregateReducerRunTimesSum = 0;
    for (int l = 0; l < AggregateReducerStartTimes.length; l++) {
        AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l];
        AggregateReducerRunTimesSum += AggregateReducerRunTimes[l];
    }
    System.out.println("AggregateReducer average task runtime (milliseconds): "
            + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length);
    long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0];
    long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0];
    for (long l : AggregateReducerRunTimes) {
        if (l < AggregateReducerRunTimesMin) {
            AggregateReducerRunTimesMin = l;
        }
        if (l > AggregateReducerRunTimesMax) {
            AggregateReducerRunTimesMax = l;
        }
    }
    System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin);
    System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax);

    System.out.println(
            "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax));

    System.out
            .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time));
    System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): "
            + (AggregateReducerEndMax - FIMMapperStartMin));
    System.out.println("total runtime (no setups, no cooldowns) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin));
    System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime);
    System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin));
    System.out.println(
            "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime);
    System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): "
            + (AggregateReducerEndMax - AggregateMapperStartMin));

    return 0;
}
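
For context, here is a hypothetical sketch (not part of the original project) of how a mapper's configure() could read the serialized MapWritable back from the DistributedCache file created above. Only the file name "samplesMap.ser" comes from the example; everything else is an assumption, and the file is assumed to be reachable in the task's working directory.

    // Hypothetical mapper-side counterpart to map.write(out) in the driver.
    private MapWritable samplesMap = new MapWritable();

    public void configure(JobConf job) {
        try {
            // Open the cached file and deserialize the MapWritable written by the driver.
            FileSystem localFs = FileSystem.getLocal(job);
            FSDataInputStream in = localFs.open(new Path("samplesMap.ser"));
            samplesMap.readFields(in);   // symmetric to map.write(out)
            in.close();
        } catch (IOException e) {
            throw new RuntimeException("Could not load samplesMap.ser from the distributed cache", e);
        }
    }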

From source file:com.csiro.hadoop.WritableTest.java

public static void main(String[] args) {
    System.out.println("*** Primitive Writable ***");

    BooleanWritable bool1 = new BooleanWritable(true);
    ByteWritable byte1 = new ByteWritable((byte) 3);
    System.out.printf("Boolean:%s Byte:%d\n", bool1, byte1.get());

    IntWritable int1 = new IntWritable(5);
    IntWritable int2 = new IntWritable(17);
    System.out.printf("I1:%d I2:%d\n", int1.get(), int2.get());

    int1.set(int2.get());
    System.out.printf("I1:%d I2:%d\n", int1.get(), int2.get());

    Integer int3 = new Integer(23);
    int1.set(int3);
    System.out.printf("I1:%d I2:%d\n", int1.get(), int2.get());

    System.out.println("*** Array Writable ***");

    ArrayWritable a = new ArrayWritable(IntWritable.class);
    a.set(new IntWritable[] { new IntWritable(1), new IntWritable(3), new IntWritable(5) });

    IntWritable[] values = (IntWritable[]) a.get();
    for (IntWritable i : values) {
        System.out.println(i);
    }

    IntArrayWritable ia = new IntArrayWritable();
    ia.set(new IntWritable[] { new IntWritable(1), new IntWritable(3), new IntWritable(5) });

    IntWritable[] ivalues = (IntWritable[]) ia.get();

    ia.set((new LongWritable[] { new LongWritable(10001) }));

    System.out.println("*** Map Writables ***");

    MapWritable m = new MapWritable();
    IntWritable key1 = new IntWritable(5);
    NullWritable value1 = NullWritable.get();

    m.put(key1, value1);
    System.out.println(m.containsKey(key1));
    System.out.println(m.get(key1));
    m.put(new LongWritable(100000000), key1);
    Set<Writable> keys = m.keySet();

    for (Writable k : keys)
        System.out.println(k.getClass());

}

From source file:com.dasasian.chok.lucene.LuceneServer.java

License:Apache License

@Override
public MapWritable getDetails(final String[] shards, final int docId, final String[] fieldNames)
        throws IOException {
    final MapWritable result = new MapWritable();
    final Document doc = doc(shards[0], docId, fieldNames);
    final List<Fieldable> fields = doc.getFields();
    for (final Fieldable field : fields) {
        final String name = field.name();
        if (field.isBinary()) {
            final byte[] binaryValue = field.getBinaryValue();
            result.put(new Text(name), new BytesWritable(binaryValue));
        } else {
            final String stringValue = field.stringValue();
            result.put(new Text(name), new Text(stringValue));
        }
    }
    return result;
}
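
A short, hypothetical sketch of how a caller might unpack such a details map, distinguishing the Text and BytesWritable values stored above. The "server" variable and printed output are illustrative assumptions.

    // Illustrative consumer of the MapWritable returned by getDetails() above.
    MapWritable details = server.getDetails(shards, docId, fieldNames);
    for (Map.Entry<Writable, Writable> entry : details.entrySet()) {
        String fieldName = entry.getKey().toString();
        Writable fieldValue = entry.getValue();
        if (fieldValue instanceof BytesWritable) {
            // binary field
            System.out.println(fieldName + ": " + ((BytesWritable) fieldValue).getLength() + " bytes");
        } else {
            // stored as Text
            System.out.println(fieldName + ": " + fieldValue);
        }
    }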

From source file:com.davidgildeh.hadoop.input.simpledb.SimpleDBRecordReader.java

License:Apache License

/**
 * Get next Key/Value Record (Tuple) from the Split
 *
 * @param key           The key to set
 * @param value         The HashMap value to set
 * @return              True - next Item available, False - No more items available
 * @throws IOException 
 */
public boolean next(Text key, MapWritable value) throws IOException {

    // Get next item off the ArrayList unless we're at the end
    if (cursor < split.getLength()) {

        Item item = items.get(cursor++);

        key.set(item.getName());
        for (Attribute attribute : item.getAttributes()) {
            value.put(new Text(attribute.getName()), new Text(attribute.getValue()));
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("Sending next record to Mappers: " + key.toString());
        }

        return true;
    } else {
        return false;
    }
}
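
For illustration only, this is the typical old-API loop that drives a reader like the one above; "reader" is assumed to be an already constructed SimpleDBRecordReader, and the key/value instances are created once and reused, as the mapred RecordReader contract expects.

    // Hypothetical driver loop for an org.apache.hadoop.mapred-style RecordReader.
    Text key = new Text();
    MapWritable value = new MapWritable();
    while (reader.next(key, value)) {
        // key holds the item name, value the attribute name -> value pairs
        System.out.println(key + " has " + value.size() + " attributes");
        value.clear();   // reuse the same MapWritable for the next record
    }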

From source file:com.digitalpebble.behemoth.gate.GATEProcessor.java

License:Apache License

public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null)
        reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

    boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false);

    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

        gatedocument = generateGATEDoc(inputDoc);
        // add it to the current corpus
        corpus.add(gatedocument);
        // get the application and assign the corpus to it
        this.GATEapplication.setCorpus(corpus);
        // process it with GATE
        this.GATEapplication.execute();

        AnnotationSet annots = null;
        if ("".equals(filters.getAnnotationSetName()))
            annots = gatedocument.getAnnotations();
        else
            annots = gatedocument.getAnnotations(filters.getAnnotationSetName());

        // enrich the input doc with the annotations from
        // the GATE application
        // transfer the annotations from the GATE document
        // to the Behemoth one using the filters
        List<com.digitalpebble.behemoth.Annotation> beheannotations = convertGATEAnnotationsToBehemoth(annots,
                inputDoc);

        // sort the annotations before adding them?
        Collections.sort(beheannotations);

        // clear the existing behemoth annotations
        if (clearBehemothAnnotations) {
            inputDoc.getAnnotations().clear();
        }

        inputDoc.getAnnotations().addAll(beheannotations);

        // add counters about num of annotations added
        if (reporter != null)
            for (com.digitalpebble.behemoth.Annotation annot : beheannotations) {
                reporter.incrCounter("GATE", annot.getType(), 1);
            }

        // Add the document features from GATE to Behemoth
        Set<String> docFeatFilter = this.filters.getDocFeaturesFilter();
        MapWritable beheMD = inputDoc.getMetadata(true);
        if (docFeatFilter.size() > 0) {
            for (String docFeatName : docFeatFilter) {
                Object featValue = gatedocument.getFeatures().get(docFeatName);
                if (featValue != null) {
                    beheMD.put(new Text(docFeatName), new Text(featValue.toString()));
                }
            }
        }

        if (reporter != null)
            reporter.incrCounter("GATE", "Document", 1);

    } catch (Exception e) {
        LOG.error(inputDoc.getUrl().toString(), e);
        if (reporter != null)
            reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
        // remove the document from the corpus again
        corpus.clear();
        // and from memory
        if (gatedocument != null)
            Factory.deleteResource(gatedocument);
    }
    // currently returns only the input document
    return new BehemothDocument[] { inputDoc };
}

From source file:com.digitalpebble.behemoth.tika.TikaProcessor.java

License:Apache License

/**
 * Classes that wish to handle Metadata separately may override this method
 *
 * @param metadata
 *            the extracted {@link org.apache.tika.metadata.Metadata}
 */
protected void processMetadata(BehemothDocument inputDoc, Metadata metadata) {
    MapWritable mapW = inputDoc.getMetadata(true);
    for (String name : metadata.names()) {
        String[] values = metadata.getValues(name);
        // temporary fix to avoid
        // Exception in thread "main" java.io.IOException: can't find class:
        // com.digitalpebble.behemoth.tika.TextArrayWritable because
        // com.digitalpebble.behemoth.tika.TextArrayWritable
        // at
        // org.apache.hadoop.io.AbstractMapWritable.readFields(AbstractMapWritable.java:204)
        // simply store multiple values as a , separated Text
        StringBuffer buff = new StringBuffer();
        for (int i = 0; i < values.length; i++) {
            if (i > 0)
                buff.append(",");
            buff.append(values[i]);
        }
        // TODO prefix md?
        mapW.put(new Text(name), new Text(buff.toString()));
        // mapW.put(new Text(name), new TextArrayWritable(values));
    }
    inputDoc.setMetadata(mapW);
}
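
Since multi-valued metadata is flattened to a single comma-separated Text above, a reader on the other side would need to split it back out. A minimal sketch follows; the metadata name is a hypothetical example, and the approach is lossy if the values themselves contain commas.

    // Illustrative read side for the comma-joined metadata stored above.
    MapWritable metadata = inputDoc.getMetadata(true);
    Writable joined = metadata.get(new Text("Content-Type"));   // hypothetical metadata name
    if (joined != null) {
        for (String v : joined.toString().split(",")) {
            System.out.println(v);
        }
    }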

From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.TestAleph2EsInputFormat.java

License:Apache License

@Test
public void test_Aleph2EsRecordReader_objectConversion() throws IOException, InterruptedException {

    @SuppressWarnings("rawtypes")
    final RecordReader mock_shard_record_reader = Mockito.mock(RecordReader.class);

    // mock returns Text key, MapWritable value
    Mockito.when(mock_shard_record_reader.getCurrentKey()).thenReturn(new Text("text_test"));

    final MapWritable test_out = new MapWritable();
    test_out.put(new Text("val_key_text"), new Text("val_val_text"));

    Mockito.when(mock_shard_record_reader.getCurrentValue()).thenReturn(test_out);

    final Aleph2EsRecordReader reader_under_test = new Aleph2EsRecordReader(mock_shard_record_reader);

    final String key = reader_under_test.getCurrentKey();
    assertEquals(String.class, key.getClass());
    assertEquals("text_test", key);

    final Tuple2<Long, IBatchRecord> value = reader_under_test.getCurrentValue();
    assertEquals(0L, value._1().longValue()); // (so something breaks in here when/if we put some logic in)
    assertEquals(Optional.empty(), value._2().getContent());
    final JsonNode json_val = value._2().getJson();
    assertTrue("Is object: " + json_val, json_val.isObject());
    assertEquals("val_val_text", json_val.get("val_key_text").asText());
    assertEquals("text_test", json_val.get("_id").asText());
}

From source file:com.ikanow.aleph2.search_service.elasticsearch.utils.TestJsonNodeWritableUtils.java

License:Apache License

@SuppressWarnings("deprecation")
@Test
public void test_mapWritableWrapper() {
    final ObjectMapper mapper = BeanTemplateUtils.configureMapper(Optional.empty());

    final MapWritable m1 = new MapWritable();

    m1.put(new Text("test1"), new BooleanWritable(true));

    final MapWritable m2 = new MapWritable();
    m2.put(new Text("nested"), m1);
    m2.put(new Text("test2"), new Text("test2"));

    final ArrayWritable a1 = new ArrayWritable(IntWritable.class);
    a1.set(new Writable[] { new IntWritable(4), new IntWritable(5) });

    final ArrayWritable a2 = new ArrayWritable(MapWritable.class);
    a2.set(new Writable[] { m1, m1 });

    m2.put(new Text("array"), a2);
    m1.put(new Text("array"), a1);

    final JsonNode j2 = JsonNodeWritableUtils.from(m2);

    assertEquals(3, j2.size());

    // Check j's contents
    assertEquals(Stream.of("nested", "test2", "array").sorted().collect(Collectors.toList()),
            Optionals.streamOf(j2.fieldNames(), false).sorted().collect(Collectors.toList()));
    assertEquals("test2", j2.get("test2").asText());

    final JsonNode j1 = j2.get("nested");
    assertEquals(2, j1.size());
    final JsonNode j1b = JsonNodeWritableUtils.from(m1);
    assertTrue("{\"test1\":true,\"array\":[4,5]}".equals(j1b.toString())
            || "{\"array\":[4,5],\"test1\":true}".equals(j1b.toString())); //(tests entrySet)
    final ArrayNode an = mapper.createArrayNode();
    an.add(mapper.convertValue(4, JsonNode.class));
    an.add(mapper.convertValue(5, JsonNode.class));
    assertEquals(Arrays.asList(mapper.convertValue(true, JsonNode.class), an),
            Optionals.streamOf(((ObjectNode) j1).elements(), false).collect(Collectors.toList()));

    // OK, now test adding:

    assertEquals(2, j1.size());

    final ObjectNode o1 = (ObjectNode) j1;
    o1.put("added", "added_this");

    final ObjectNodeWrapper o1c = (ObjectNodeWrapper) o1;
    assertFalse(o1c.containsKey("not_present"));
    assertTrue(o1c.containsKey("added"));
    assertTrue(o1c.containsKey("test1"));

    assertEquals(Stream.of("test1", "array", "added").sorted().collect(Collectors.toList()),
            Optionals.streamOf(j1.fieldNames(), false).sorted().collect(Collectors.toList()));
    assertEquals(
            Arrays.asList(mapper.convertValue(true, JsonNode.class), an,
                    mapper.convertValue("added_this", JsonNode.class)),
            Optionals.streamOf(((ObjectNode) j1).elements(), false).collect(Collectors.toList()));
    assertTrue(j1.toString().contains("added_this"));
    assertTrue(j1.toString().contains("4,5"));

    assertEquals(mapper.convertValue("added_this", JsonNode.class), j1.get("added"));

    assertEquals(3, j1.size());

    // OK now test removing:

    assertEquals(null, o1.remove("not_present"));
    assertEquals(mapper.convertValue(true, JsonNode.class), o1.remove("test1"));
    assertEquals(2, o1.size());
    ObjectNode o1b = o1.remove(Arrays.asList("added", "array"));
    assertEquals(0, o1.size());
    assertEquals(0, o1b.size());

    o1.putAll(JsonNodeWritableUtils.from(m1)); // will be minus one object
    assertEquals(2, o1.size());
    assertTrue(o1c.containsValue(mapper.convertValue(true, JsonNode.class)));
    assertFalse(o1c.containsValue("banana"));

    final ObjectNodeWrapper o2 = (ObjectNodeWrapper) JsonNodeWritableUtils.from(m2);
    assertFalse(o2.isEmpty());
    assertTrue(o2.containsKey("array"));
    assertFalse(o2.containsValue("array"));
    assertTrue(o2.containsValue(mapper.convertValue("test2", JsonNode.class)));
    assertEquals(TextNode.class, o2.remove("test2").getClass());
    assertEquals(2, o2.size());
    o2.removeAll();
    assertEquals(0, o2.size());
}

From source file:com.jfolson.hive.serde.RTypedBytesWritableInput.java

License:Apache License

public MapWritable readMap(MapWritable mw) throws IOException {
    if (mw == null) {
        mw = new MapWritable();
    }
    int length = in.readMapHeader();
    for (int i = 0; i < length; i++) {
        Writable key = read();
        Writable value = read();
        mw.put(key, value);
    }
    return mw;
}

From source file:com.redgate.hadoop.hive.azuretables.AzureTablesRecordReader.java

License:Apache License

/**
 * Grabs the next result and processes the DynamicTableEntity into a Hive
 * friendly MapWritable.
 * 
 * @param key
 *            The RowID for the entity. Note that this is not really an Azure
 *            key, since the partition is implicit in the key
 * @param value
 *            A MapWritable which will be populated with values from the
 *            DynamicTableEntity returned by the Azure query.
 */
public boolean next(Text key, MapWritable value) throws IOException {
    if (!results.hasNext())
        return false;
    DynamicTableEntity entity = results.next();
    key.set(entity.getRowKey());
    for (Entry<String, EntityProperty> entry : entity.getProperties().entrySet()) {

        final EntityProperty property = entry.getValue();
        // Note that azure table entity keys are forced to lower case for
        // matching with hive column names
        final String propertyKey = entry.getKey().toLowerCase();
        final String propertyValue = property.getValueAsString();
        final Writable writableValue = SERIALIZED_NULL.equals(propertyValue) ? NullWritable.get()
                : new Text(propertyValue);
        value.put(new Text(propertyKey), writableValue);
    }
    pos++;
    return true;
}
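
One consequence of the SERIALIZED_NULL handling above is that consumers see null columns as NullWritable rather than Text, so value lookups need an instance check. A small illustrative snippet, with a hypothetical column name:

    // Illustrative only: reading a column out of the MapWritable populated above.
    Writable cell = value.get(new Text("somecolumn"));   // hypothetical column name
    if (cell == null || cell instanceof NullWritable) {
        // column absent, or stored as the serialized null sentinel
    } else {
        String text = cell.toString();
    }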