List of usage examples for org.apache.hadoop.io.MapWritable#put
@Override
public Writable put(Writable key, Writable value)
From source file:MRDriver.java
License:Apache License
public int run(String args[]) throws Exception { FileSystem fs = null;/*from w w w .j av a2 s . c o m*/ Path samplesMapPath = null; float epsilon = Float.parseFloat(args[0]); double delta = Double.parseDouble(args[1]); int minFreqPercent = Integer.parseInt(args[2]); int d = Integer.parseInt(args[3]); int datasetSize = Integer.parseInt(args[4]); int numSamples = Integer.parseInt(args[5]); double phi = Double.parseDouble(args[6]); Random rand; /************************ Job 1 (local FIM) Configuration ************************/ JobConf conf = new JobConf(getConf()); /* * Compute the number of required "votes" for an itemsets to be * declared frequent */ // The +1 at the end is needed to ensure reqApproxNum > numsamples / 2. int reqApproxNum = (int) Math .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1; int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi))); //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum); conf.setInt("PARMM.reducersNum", numSamples); conf.setInt("PARMM.datasetSize", datasetSize); conf.setInt("PARMM.minFreqPercent", minFreqPercent); conf.setInt("PARMM.sampleSize", sampleSize); conf.setFloat("PARMM.epsilon", epsilon); // Set the number of reducers equal to the number of samples, to // maximize parallelism. Required by our Partitioner. conf.setNumReduceTasks(numSamples); // XXX: why do we disable the speculative execution? MR conf.setBoolean("mapred.reduce.tasks.speculative.execution", false); conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI); /* * Enable compression of map output. * * We do it for this job and not for the aggregation one because * each mapper there only print out one record for each itemset, * so there isn't much to compress, I'd say. MR * * In Amazon MapReduce compression of the map output seems to be * happen by default and the Snappy codec is used, which is * extremely fast. 
*/ conf.setBoolean("mapred.compress.map.output", true); //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class); conf.setJarByClass(MRDriver.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(DoubleWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); // We write the collections found in a reducers as a SequenceFile conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9])); // set the mapper class based on command line option switch (Integer.parseInt(args[7])) { case 1: System.out.println("running partition mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(PartitionMapper.class); break; case 2: System.out.println("running binomial mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(BinomialSamplerMapper.class); break; case 3: System.out.println("running coin mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(CoinFlipSamplerMapper.class); case 4: System.out.println("running sampler mapper..."); SequenceFileInputFormat.addInputPath(conf, new Path(args[8])); conf.setMapperClass(InputSamplerMapper.class); // create a random sample of size T*m rand = new Random(); long sampling_start_time = System.nanoTime(); int[] samples = new int[numSamples * sampleSize]; for (int i = 0; i < numSamples * sampleSize; i++) { samples[i] = rand.nextInt(datasetSize); } // for each key in the sample, create a list of all T samples to which this key belongs Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>(); for (int i = 0; i < numSamples * sampleSize; i++) { ArrayList<IntWritable> sampleIDs = null; LongWritable key = new LongWritable(samples[i]); if (hashTable.containsKey(key)) sampleIDs = 
hashTable.get(key); else sampleIDs = new ArrayList<IntWritable>(); sampleIDs.add(new IntWritable(i % numSamples)); hashTable.put(key, sampleIDs); } /* * Convert the Hastable to a MapWritable which we will * write to HDFS and distribute to all Mappers using * DistributedCache */ MapWritable map = new MapWritable(); for (LongWritable key : hashTable.keySet()) { ArrayList<IntWritable> sampleIDs = hashTable.get(key); IntArrayWritable sampleIDsIAW = new IntArrayWritable(); sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()])); map.put(key, sampleIDsIAW); } fs = FileSystem.get(URI.create("samplesMap.ser"), conf); samplesMapPath = new Path("samplesMap.ser"); FSDataOutputStream out = fs.create(samplesMapPath, true); map.write(out); out.sync(); out.close(); DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"), conf); // stop the sampling timer long sampling_end_time = System.nanoTime(); long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000; System.out.println("sampling runtime (milliseconds): " + sampling_runtime); break; // end switch case case 5: System.out.println("running random integer partition mapper..."); conf.setInputFormat(WholeSplitInputFormat.class); Path inputFilePath = new Path(args[8]); WholeSplitInputFormat.addInputPath(conf, inputFilePath); conf.setMapperClass(RandIntPartSamplerMapper.class); // Compute number of map tasks. 
fs = inputFilePath.getFileSystem(conf); FileStatus inputFileStatus = fs.getFileStatus(inputFilePath); long len = inputFileStatus.getLen(); long blockSize = inputFileStatus.getBlockSize(); conf.setLong("mapred.min.split.size", blockSize); conf.setLong("mapred.max.split.size", blockSize); int mapTasksNum = ((int) (len / blockSize)) + 1; conf.setNumMapTasks(mapTasksNum); //System.out.println("len: " + len + " blockSize: " // + blockSize + " mapTasksNum: " + mapTasksNum); // Extract random integer partition of total sample // size into up to mapTasksNum partitions. // XXX I'm not sure this is a correct way to do // it. rand = new Random(); IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples]; for (int j = 0; j < numSamples; j++) { IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum]; int sum = 0; int i; for (i = 0; i < mapTasksNum - 1; i++) { int size = rand.nextInt(sampleSize - sum); tempToSampleArr[i] = new IntWritable(size); sum += size; if (sum > numSamples * sampleSize) { System.out.println("Something went wrong generating the sample Sizes"); System.exit(1); } if (sum == sampleSize) { break; } } if (i == mapTasksNum - 1) { tempToSampleArr[i] = new IntWritable(sampleSize - sum); } else { for (; i < mapTasksNum; i++) { tempToSampleArr[i] = new IntWritable(0); } } Collections.shuffle(Arrays.asList(tempToSampleArr)); for (i = 0; i < mapTasksNum; i++) { toSampleArr[i][j] = tempToSampleArr[i]; } } for (int i = 0; i < mapTasksNum; i++) { DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i); } break; default: System.err.println("Wrong Mapper ID. Can only be in [1,5]"); System.exit(1); break; } /* * We don't use the default hash partitioner because we want to * maximize the parallelism. That's why we also fix the number * of reducers. 
*/ conf.setPartitionerClass(FIMPartitioner.class); conf.setReducerClass(FIMReducer.class); /************************ Job 2 (aggregation) Configuration ************************/ JobConf confAggr = new JobConf(getConf()); confAggr.setInt("PARMM.reducersNum", numSamples); confAggr.setInt("PARMM.reqApproxNum", reqApproxNum); confAggr.setInt("PARMM.sampleSize", sampleSize); confAggr.setFloat("PARMM.epsilon", epsilon); // XXX: Why do we disable speculative execution? MR confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false); confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI); confAggr.setJarByClass(MRDriver.class); confAggr.setMapOutputKeyClass(Text.class); confAggr.setMapOutputValueClass(DoubleWritable.class); confAggr.setOutputKeyClass(Text.class); confAggr.setOutputValueClass(Text.class); confAggr.setMapperClass(AggregateMapper.class); confAggr.setReducerClass(AggregateReducer.class); confAggr.setInputFormat(CombineSequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9])); FileOutputFormat.setOutputPath(confAggr, new Path(args[10])); long FIMjob_start_time = System.currentTimeMillis(); RunningJob FIMjob = JobClient.runJob(conf); long FIMjob_end_time = System.currentTimeMillis(); RunningJob aggregateJob = JobClient.runJob(confAggr); long aggrJob_end_time = System.currentTimeMillis(); long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time; long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time; if (args[7].equals("4")) { // Remove samplesMap file fs.delete(samplesMapPath, false); } Counters counters = FIMjob.getCounters(); Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart"); long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()]; int i = 0; for (Counters.Counter counter : FIMMapperStartTimesCounters) { FIMMapperStartTimes[i++] = counter.getCounter(); } Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd"); long[] FIMMapperEndTimes = 
new long[FIMMapperEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : FIMMapperEndTimesCounters) { FIMMapperEndTimes[i++] = counter.getCounter(); } Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart"); long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()]; i = 0; for (Counters.Counter counter : FIMReducerStartTimesCounters) { FIMReducerStartTimes[i++] = counter.getCounter(); } Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd"); long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : FIMReducerEndTimesCounters) { FIMReducerEndTimes[i++] = counter.getCounter(); } Counters countersAggr = aggregateJob.getCounters(); Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart"); long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateMapperStartTimesCounters) { AggregateMapperStartTimes[i++] = counter.getCounter(); } Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd"); long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateMapperEndTimesCounters) { AggregateMapperEndTimes[i++] = counter.getCounter(); } Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart"); long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateReducerStartTimesCounters) { AggregateReducerStartTimes[i++] = counter.getCounter(); } Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd"); long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()]; i = 0; for (Counters.Counter counter : AggregateReducerEndTimesCounters) { 
AggregateReducerEndTimes[i++] = counter.getCounter(); } long FIMMapperStartMin = FIMMapperStartTimes[0]; for (long l : FIMMapperStartTimes) { if (l < FIMMapperStartMin) { FIMMapperStartMin = l; } } long FIMMapperEndMax = FIMMapperEndTimes[0]; for (long l : FIMMapperEndTimes) { if (l > FIMMapperEndMax) { FIMMapperEndMax = l; } } System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time)); System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin)); long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length]; long FIMMapperRunTimesSum = 0; for (int l = 0; l < FIMMapperStartTimes.length; l++) { FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l]; FIMMapperRunTimesSum += FIMMapperRunTimes[l]; } System.out.println("FIMMapper average task runtime (milliseconds): " + FIMMapperRunTimesSum / FIMMapperStartTimes.length); long FIMMapperRunTimesMin = FIMMapperRunTimes[0]; long FIMMapperRunTimesMax = FIMMapperRunTimes[0]; for (long l : FIMMapperRunTimes) { if (l < FIMMapperRunTimesMin) { FIMMapperRunTimesMin = l; } if (l > FIMMapperRunTimesMax) { FIMMapperRunTimesMax = l; } } System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin); System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax); long FIMReducerStartMin = FIMReducerStartTimes[0]; for (long l : FIMReducerStartTimes) { if (l < FIMReducerStartMin) { FIMReducerStartMin = l; } } long FIMReducerEndMax = FIMReducerEndTimes[0]; for (long l : FIMReducerEndTimes) { if (l > FIMReducerEndMax) { FIMReducerEndMax = l; } } System.out .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax)); System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin)); long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length]; long FIMReducerRunTimesSum = 0; for (int l = 0; l < 
FIMReducerStartTimes.length; l++) { FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l]; FIMReducerRunTimesSum += FIMReducerRunTimes[l]; } System.out.println("FIMReducer average task runtime (milliseconds): " + FIMReducerRunTimesSum / FIMReducerStartTimes.length); long FIMReducerRunTimesMin = FIMReducerRunTimes[0]; long FIMReducerRunTimesMax = FIMReducerRunTimes[0]; for (long l : FIMReducerRunTimes) { if (l < FIMReducerRunTimesMin) { FIMReducerRunTimesMin = l; } if (l > FIMReducerRunTimesMax) { FIMReducerRunTimesMax = l; } } System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin); System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax); System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax)); long AggregateMapperStartMin = AggregateMapperStartTimes[0]; for (long l : AggregateMapperStartTimes) { if (l < AggregateMapperStartMin) { AggregateMapperStartMin = l; } } long AggregateMapperEndMax = AggregateMapperEndTimes[0]; for (long l : AggregateMapperEndTimes) { if (l > AggregateMapperEndMax) { AggregateMapperEndMax = l; } } System.out.println( "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time)); System.out.println("AggregateMapper total runtime (milliseconds): " + (AggregateMapperEndMax - AggregateMapperStartMin)); long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length]; long AggregateMapperRunTimesSum = 0; for (int l = 0; l < AggregateMapperStartTimes.length; l++) { AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l]; AggregateMapperRunTimesSum += AggregateMapperRunTimes[l]; } System.out.println("AggregateMapper average task runtime (milliseconds): " + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length); long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0]; long AggregateMapperRunTimesMax = 
AggregateMapperRunTimes[0]; for (long l : AggregateMapperRunTimes) { if (l < AggregateMapperRunTimesMin) { AggregateMapperRunTimesMin = l; } if (l > AggregateMapperRunTimesMax) { AggregateMapperRunTimesMax = l; } } System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin); System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax); long AggregateReducerStartMin = AggregateReducerStartTimes[0]; for (long l : AggregateReducerStartTimes) { if (l < AggregateReducerStartMin) { AggregateReducerStartMin = l; } } long AggregateReducerEndMax = AggregateReducerEndTimes[0]; for (long l : AggregateReducerEndTimes) { if (l > AggregateReducerEndMax) { AggregateReducerEndMax = l; } } System.out.println("Aggregate job round shuffle phase runtime (milliseconds): " + (AggregateReducerStartMin - AggregateMapperEndMax)); System.out.println("AggregateReducer total runtime (milliseconds): " + (AggregateReducerEndMax - AggregateReducerStartMin)); long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length]; long AggregateReducerRunTimesSum = 0; for (int l = 0; l < AggregateReducerStartTimes.length; l++) { AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l]; AggregateReducerRunTimesSum += AggregateReducerRunTimes[l]; } System.out.println("AggregateReducer average task runtime (milliseconds): " + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length); long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0]; long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0]; for (long l : AggregateReducerRunTimes) { if (l < AggregateReducerRunTimesMin) { AggregateReducerRunTimesMin = l; } if (l > AggregateReducerRunTimesMax) { AggregateReducerRunTimesMax = l; } } System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin); System.out.println("AggregateReducer maximum task runtime 
(milliseconds): " + AggregateReducerRunTimesMax); System.out.println( "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax)); System.out .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time)); System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): " + (AggregateReducerEndMax - FIMMapperStartMin)); System.out.println("total runtime (no setups, no cooldowns) (milliseconds): " + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin)); System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime); System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): " + (FIMReducerEndMax - FIMMapperStartMin)); System.out.println( "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime); System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): " + (AggregateReducerEndMax - AggregateMapperStartMin)); return 0; }
From source file:com.csiro.hadoop.WritableTest.java
public static void main(String[] args) { System.out.println("*** Primitive Writable ***"); BooleanWritable bool1 = new BooleanWritable(true); ByteWritable byte1 = new ByteWritable((byte) 3); System.out.printf("Boolean:%s Byte:%d\n", bool1, byte1.get()); IntWritable int1 = new IntWritable(5); IntWritable int2 = new IntWritable(17); System.out.printf("I1:%d I2:%d\n", int1.get(), int2.get()); int1.set(int2.get()); System.out.printf("I1:%d I2:%d\n", int1.get(), int2.get()); Integer int3 = new Integer(23); int1.set(int3); System.out.printf("I1:%d I2:%d\n", int1.get(), int2.get()); System.out.println("*** Array Writable ***"); ArrayWritable a = new ArrayWritable(IntWritable.class); a.set(new IntWritable[] { new IntWritable(1), new IntWritable(3), new IntWritable(5) }); IntWritable[] values = (IntWritable[]) a.get(); for (IntWritable i : values) { System.out.println(i);//from w ww . ja va 2 s . c o m } IntArrayWritable ia = new IntArrayWritable(); ia.set(new IntWritable[] { new IntWritable(1), new IntWritable(3), new IntWritable(5) }); IntWritable[] ivalues = (IntWritable[]) ia.get(); ia.set((new LongWritable[] { new LongWritable(10001) })); System.out.println("*** Map Writables ***"); MapWritable m = new MapWritable(); IntWritable key1 = new IntWritable(5); NullWritable value1 = NullWritable.get(); m.put(key1, value1); System.out.println(m.containsKey(key1)); System.out.println(m.get(key1)); m.put(new LongWritable(100000000), key1); Set<Writable> keys = m.keySet(); for (Writable k : keys) System.out.println(k.getClass()); }
From source file:com.dasasian.chok.lucene.LuceneServer.java
License:Apache License
@Override public MapWritable getDetails(final String[] shards, final int docId, final String[] fieldNames) throws IOException { final MapWritable result = new MapWritable(); final Document doc = doc(shards[0], docId, fieldNames); final List<Fieldable> fields = doc.getFields(); for (final Fieldable field : fields) { final String name = field.name(); if (field.isBinary()) { final byte[] binaryValue = field.getBinaryValue(); result.put(new Text(name), new BytesWritable(binaryValue)); } else {//w w w . j a va 2s . c o m final String stringValue = field.stringValue(); result.put(new Text(name), new Text(stringValue)); } } return result; }
From source file:com.davidgildeh.hadoop.input.simpledb.SimpleDBRecordReader.java
License:Apache License
/** * Get next Key/Value Record (Tuple) from the Split * //w w w . j a v a2s. com * @param key The key to set * @param value The HashMap value to set * @return True - next Item available, False - No more items available * @throws IOException */ public boolean next(Text key, MapWritable value) throws IOException { // Get next item off the ArrayList unless we're at the end if (cursor < split.getLength()) { Item item = items.get(cursor++); key.set(item.getName()); for (Attribute attribute : item.getAttributes()) { value.put(new Text(attribute.getName()), new Text(attribute.getValue())); } if (LOG.isDebugEnabled()) { LOG.debug("Sending next record to Mappers: " + key.toString()); } return true; } else { return false; } }
From source file:com.digitalpebble.behemoth.gate.GATEProcessor.java
License:Apache License
public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) { if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString()); boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false); // process the text passed as value with the application // a) create a GATE document based on the text value gate.Document gatedocument = null; try {//ww w. j a v a 2 s. co m gatedocument = generateGATEDoc(inputDoc); // add it to the current corpus corpus.add(gatedocument); // get the application and assign the corpus to it this.GATEapplication.setCorpus(corpus); // process it with GATE this.GATEapplication.execute(); AnnotationSet annots = null; if ("".equals(filters.getAnnotationSetName())) annots = gatedocument.getAnnotations(); else annots = gatedocument.getAnnotations(filters.getAnnotationSetName()); // enrich the input doc with the annotations from // the GATE application // transfer the annotations from the GATE document // to the Behemoth one using the filters List<com.digitalpebble.behemoth.Annotation> beheannotations = convertGATEAnnotationsToBehemoth(annots, inputDoc); // sort the annotations before adding them? 
Collections.sort(beheannotations); // clear the existing behemoth annotations if (clearBehemothAnnotations) { inputDoc.getAnnotations().clear(); } inputDoc.getAnnotations().addAll(beheannotations); // add counters about num of annotations added if (reporter != null) for (com.digitalpebble.behemoth.Annotation annot : beheannotations) { reporter.incrCounter("GATE", annot.getType(), 1); } // Add the document features from GATE to Behemoth Set<String> docFeatFilter = this.filters.getDocFeaturesFilter(); MapWritable beheMD = inputDoc.getMetadata(true); if (docFeatFilter.size() > 0) { for (String docFeatName : docFeatFilter) { Object featValue = gatedocument.getFeatures().get(docFeatName); if (featValue != null) { beheMD.put(new Text(docFeatName), new Text(featValue.toString())); } } } if (reporter != null) reporter.incrCounter("GATE", "Document", 1); } catch (Exception e) { LOG.error(inputDoc.getUrl().toString(), e); if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1); } finally { // remove the document from the corpus again corpus.clear(); // and from memory if (gatedocument != null) Factory.deleteResource(gatedocument); } // currently returns only the input document return new BehemothDocument[] { inputDoc }; }
From source file:com.digitalpebble.behemoth.tika.TikaProcessor.java
License:Apache License
/** * Classes that wish to handle Metadata separately may override this method * //ww w . ja va2s. c o m * @param metadata * the extracted {@link org.apache.tika.metadata.Metadata} */ protected void processMetadata(BehemothDocument inputDoc, Metadata metadata) { MapWritable mapW = inputDoc.getMetadata(true); for (String name : metadata.names()) { String[] values = metadata.getValues(name); // temporary fix to avoid // Exception in thread "main" java.io.IOException: can't find class: // com.digitalpebble.behemoth.tika.TextArrayWritable because // com.digitalpebble.behemoth.tika.TextArrayWritable // at // org.apache.hadoop.io.AbstractMapWritable.readFields(AbstractMapWritable.java:204) // simply store multiple values as a , separated Text StringBuffer buff = new StringBuffer(); for (int i = 0; i < values.length; i++) { if (i > 0) buff.append(","); buff.append(values[i]); } // TODO prefix md? mapW.put(new Text(name), new Text(buff.toString())); // mapW.put(new Text(name), new TextArrayWritable(values)); } inputDoc.setMetadata(mapW); }
From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.TestAleph2EsInputFormat.java
License:Apache License
@Test public void test_Aleph2EsRecordReader_objectConversion() throws IOException, InterruptedException { @SuppressWarnings("rawtypes") final RecordReader mock_shard_record_reader = Mockito.mock(RecordReader.class); // mock returns Text key, MapWritable value Mockito.when(mock_shard_record_reader.getCurrentKey()).thenReturn(new Text("text_test")); final MapWritable test_out = new MapWritable(); test_out.put(new Text("val_key_text"), new Text("val_val_text")); Mockito.when(mock_shard_record_reader.getCurrentValue()).thenReturn(test_out); final Aleph2EsRecordReader reader_under_test = new Aleph2EsRecordReader(mock_shard_record_reader); final String key = reader_under_test.getCurrentKey(); assertEquals(String.class, key.getClass()); assertEquals("text_test", key); final Tuple2<Long, IBatchRecord> value = reader_under_test.getCurrentValue(); assertEquals(0L, value._1().longValue()); // (so something breaks in here when/if we put some logic in) assertEquals(Optional.empty(), value._2().getContent()); final JsonNode json_val = value._2().getJson(); assertTrue("Is object: " + json_val, json_val.isObject()); assertEquals("val_val_text", json_val.get("val_key_text").asText()); assertEquals("text_test", json_val.get("_id").asText()); }
From source file:com.ikanow.aleph2.search_service.elasticsearch.utils.TestJsonNodeWritableUtils.java
License:Apache License
@SuppressWarnings("deprecation") @Test// www. j a v a2 s. c o m public void test_mapWritableWrapper() { final ObjectMapper mapper = BeanTemplateUtils.configureMapper(Optional.empty()); final MapWritable m1 = new MapWritable(); m1.put(new Text("test1"), new BooleanWritable(true)); final MapWritable m2 = new MapWritable(); m2.put(new Text("nested"), m1); m2.put(new Text("test2"), new Text("test2")); final ArrayWritable a1 = new ArrayWritable(IntWritable.class); a1.set(new Writable[] { new IntWritable(4), new IntWritable(5) }); final ArrayWritable a2 = new ArrayWritable(MapWritable.class); a2.set(new Writable[] { m1, m1 }); m2.put(new Text("array"), a2); m1.put(new Text("array"), a1); final JsonNode j2 = JsonNodeWritableUtils.from(m2); assertEquals(3, j2.size()); // Check j's contents assertEquals(Stream.of("nested", "test2", "array").sorted().collect(Collectors.toList()), Optionals.streamOf(j2.fieldNames(), false).sorted().collect(Collectors.toList())); assertEquals("test2", j2.get("test2").asText()); final JsonNode j1 = j2.get("nested"); assertEquals(2, j1.size()); final JsonNode j1b = JsonNodeWritableUtils.from(m1); assertTrue("{\"test1\":true,\"array\":[4,5]}".equals(j1b.toString()) || "{\"array\":[4,5],\"test1\":true}".equals(j1b.toString())); //(tests entrySet) final ArrayNode an = mapper.createArrayNode(); an.add(mapper.convertValue(4, JsonNode.class)); an.add(mapper.convertValue(5, JsonNode.class)); assertEquals(Arrays.asList(mapper.convertValue(true, JsonNode.class), an), Optionals.streamOf(((ObjectNode) j1).elements(), false).collect(Collectors.toList())); // OK, now test adding: assertEquals(2, j1.size()); final ObjectNode o1 = (ObjectNode) j1; o1.put("added", "added_this"); final ObjectNodeWrapper o1c = (ObjectNodeWrapper) o1; assertFalse(o1c.containsKey("not_present")); assertTrue(o1c.containsKey("added")); assertTrue(o1c.containsKey("test1")); assertEquals(Stream.of("test1", "array", "added").sorted().collect(Collectors.toList()), 
Optionals.streamOf(j1.fieldNames(), false).sorted().collect(Collectors.toList())); assertEquals( Arrays.asList(mapper.convertValue(true, JsonNode.class), an, mapper.convertValue("added_this", JsonNode.class)), Optionals.streamOf(((ObjectNode) j1).elements(), false).collect(Collectors.toList())); assertTrue(j1.toString().contains("added_this")); assertTrue(j1.toString().contains("4,5")); assertEquals(mapper.convertValue("added_this", JsonNode.class), j1.get("added")); assertEquals(3, j1.size()); // OK now test removing: assertEquals(null, o1.remove("not_present")); assertEquals(mapper.convertValue(true, JsonNode.class), o1.remove("test1")); assertEquals(2, o1.size()); ObjectNode o1b = o1.remove(Arrays.asList("added", "array")); assertEquals(0, o1.size()); assertEquals(0, o1b.size()); o1.putAll(JsonNodeWritableUtils.from(m1)); // will be minus one object assertEquals(2, o1.size()); assertTrue(o1c.containsValue(mapper.convertValue(true, JsonNode.class))); assertFalse(o1c.containsValue("banana")); final ObjectNodeWrapper o2 = (ObjectNodeWrapper) JsonNodeWritableUtils.from(m2); assertFalse(o2.isEmpty()); assertTrue(o2.containsKey("array")); assertFalse(o2.containsValue("array")); assertTrue(o2.containsValue(mapper.convertValue("test2", JsonNode.class))); assertEquals(TextNode.class, o2.remove("test2").getClass()); assertEquals(2, o2.size()); o2.removeAll(); assertEquals(0, o2.size()); }
From source file:com.jfolson.hive.serde.RTypedBytesWritableInput.java
License:Apache License
public MapWritable readMap(MapWritable mw) throws IOException { if (mw == null) { mw = new MapWritable(); }//from w w w .ja v a 2 s .com int length = in.readMapHeader(); for (int i = 0; i < length; i++) { Writable key = read(); Writable value = read(); mw.put(key, value); } return mw; }
From source file:com.redgate.hadoop.hive.azuretables.AzureTablesRecordReader.java
License:Apache License
/** * Grabs the next result and process the DynamicTableEntity into a Hive * friendly MapWriteable//from w w w .jav a 2 s. co m * * @param key * The RowID for the entity. Not that this is not really an Azure * key, since the partition is implicit in the key * @param value * A MapWriteable which will be populated with values from the * DynamicTableEntity returned by the Azure query. */ public boolean next(Text key, MapWritable value) throws IOException { if (!results.hasNext()) return false; DynamicTableEntity entity = results.next(); key.set(entity.getRowKey()); for (Entry<String, EntityProperty> entry : entity.getProperties().entrySet()) { final EntityProperty property = entry.getValue(); // Note that azure table entity keys are forced to lower case for // matching with hive column names final String propertyKey = entry.getKey().toLowerCase(); final String propertyValue = property.getValueAsString(); final Writable writableValue = SERIALIZED_NULL.equals(propertyValue) ? NullWritable.get() : new Text(propertyValue); value.put(new Text(propertyKey), writableValue); } pos++; return true; }