List of usage examples for the org.apache.hadoop.io.DoubleWritable constructor
public DoubleWritable(double value)
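Before the examples from real projects, a minimal self-contained sketch (not taken from any of the source files below) of what this constructor provides: it wraps a primitive double in Hadoop's Writable serialization contract.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;

public class DoubleWritableRoundTrip {
    public static void main(String[] args) throws IOException {
        // Wrap a primitive double; get()/set() expose the boxed value.
        DoubleWritable dw = new DoubleWritable(3.14);

        // Serialize it the way the MapReduce framework does between tasks.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        dw.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and confirm the round trip.
        DoubleWritable copy = new DoubleWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(copy.get());         // 3.14
        System.out.println(dw.compareTo(copy)); // 0
    }
}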
From source file:org.apache.mahout.math.hadoop.similarity.VectorDistanceMapper.java
License:Apache License
@Override
protected void map(WritableComparable<?> key, VectorWritable value, Context context)
    throws IOException, InterruptedException {
  String keyName;
  Vector valVec = value.get();
  if (valVec instanceof NamedVector) {
    keyName = ((NamedVector) valVec).getName();
  } else {
    keyName = key.toString();
  }
  for (NamedVector seedVector : seedVectors) {
    double distance = measure.distance(seedVector, valVec);
    if (!usesThreshold || distance <= maxDistance) {
      StringTuple outKey = new StringTuple();
      outKey.add(seedVector.getName());
      outKey.add(keyName);
      context.write(outKey, new DoubleWritable(distance));
    }
  }
}
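The value wrapped in the DoubleWritable above comes from a pluggable DistanceMeasure. A standalone sketch of that call, assuming Mahout's EuclideanDistanceMeasure and DenseVector (these names are not part of the mapper source above):

import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;

public class DistanceDemo {
    public static void main(String[] args) {
        Vector a = new DenseVector(new double[] { 1.0, 2.0 });
        Vector b = new DenseVector(new double[] { 4.0, 6.0 });
        // Euclidean distance = sqrt(3^2 + 4^2) = 5.0
        System.out.println(new EuclideanDistanceMeasure().distance(a, b));
    }
}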
From source file:org.apache.mahout.math.hadoop.stats.BasicStatsTest.java
License:Apache License
private void produceTestData(Path input) throws Exception {
  FileSystem fs = FileSystem.get(input.toUri(), conf);
  SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, conf, input, IntWritable.class, DoubleWritable.class);
  //Random random = new MersenneTwisterRNG();
  //Normal normal = new Normal(5, 3, random);
  //for (int i = 0; i < 10000; i++) {
  //  writer.append(new IntWritable(i), new DoubleWritable((long) normal.nextDouble()));
  //}
  int i = 0;
  writer.append(new IntWritable(i++), new DoubleWritable(7));
  writer.append(new IntWritable(i++), new DoubleWritable(9));
  writer.append(new IntWritable(i++), new DoubleWritable(9));
  writer.append(new IntWritable(i++), new DoubleWritable(10));
  writer.append(new IntWritable(i++), new DoubleWritable(10));
  writer.append(new IntWritable(i++), new DoubleWritable(10));
  writer.append(new IntWritable(i++), new DoubleWritable(10));
  writer.append(new IntWritable(i++), new DoubleWritable(11));
  writer.append(new IntWritable(i++), new DoubleWritable(11));
  writer.append(new IntWritable(i++), new DoubleWritable(13));
  writer.close();
}
From source file:org.apache.mahout.math.hadoop.stats.BasicStatsTest.java
License:Apache License
@Test
public void testStdDev2() throws Exception {
  Path input = getTestTempFilePath("stdDev/counts.file");
  Path output = getTestTempFilePath("stdDev/output.file");
  FileSystem fs = FileSystem.get(input.toUri(), conf);
  SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, conf, input, IntWritable.class, DoubleWritable.class);
  Random random = RandomUtils.getRandom();
  Normal normal = new Normal(5, 3, random);
  for (int i = 0; i < 1000000; i++) {
    writer.append(new IntWritable(i), new DoubleWritable(normal.nextInt()));
  }
  writer.close();
  double v = BasicStats.stdDev(input, output, conf);
  assertEquals(3, v, 0.02);
}
From source file:org.apache.mahout.math.hadoop.stats.StandardDeviationCalculatorMapper.java
License:Apache License
@Override
protected void map(IntWritable key, Writable value, Context context)
    throws IOException, InterruptedException {
  if (key.get() == -1) {
    return;
  }
  // Kind of ugly, but such is life
  double df = Double.NaN;
  if (value instanceof LongWritable) {
    df = ((LongWritable) value).get();
  } else if (value instanceof DoubleWritable) {
    df = ((DoubleWritable) value).get();
  }
  if (!Double.isNaN(df)) {
    // For calculating the sum of squares
    context.write(SUM_OF_SQUARES, new DoubleWritable(df * df));
    context.write(SUM, new DoubleWritable(df));
    // For calculating the total number of entries
    context.write(TOTAL_COUNT, new DoubleWritable(1));
  }
}
From source file:org.apache.mahout.math.hadoop.stats.StandardDeviationCalculatorReducer.java
License:Apache License
@Override
protected void reduce(IntWritable key, Iterable<DoubleWritable> values, Context context)
    throws IOException, InterruptedException {
  double sum = 0.0;
  for (DoubleWritable value : values) {
    sum += value.get();
  }
  context.write(key, new DoubleWritable(sum));
}
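The mapper and reducer above only produce three running aggregates keyed by SUM_OF_SQUARES, SUM, and TOTAL_COUNT; the standard deviation itself is derived afterwards. A hedged sketch of one standard formulation of that final step (Mahout's BasicStats may differ in detail):

// Combine the three reduced aggregates into a sample standard deviation,
// using the identity: var = (sumOfSquares - sum^2 / n) / (n - 1).
static double stdDevFromAggregates(double sum, double sumOfSquares, double n) {
  double variance = (sumOfSquares - (sum * sum) / n) / (n - 1);
  return Math.sqrt(variance);
}
// e.g. for the ten values written in produceTestData above (sum = 100,
// sumOfSquares = 1022, n = 10) this yields sqrt(22/9) ≈ 1.563.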
From source file:org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.java
License:Apache License
/**
 * Perform the LLR calculation. Input is k:ngram:ngramFreq, v:(h_|t_)subgram:subgramFreq,
 * N = ngram total.
 *
 * Each ngram has two subgrams, a head and a tail, referred to as A and B respectively below.
 *
 *  A+ B: number of times a+b appear together: ngramFreq
 *  A+!B: number of times A appears without B: hSubgramFreq - ngramFreq
 * !A+ B: number of times B appears without A: tSubgramFreq - ngramFreq
 * !A+!B: number of times neither A nor B appears (in that order):
 *        N - (subgramFreqA + subgramFreqB - ngramFreq)
 */
@Override
protected void reduce(Gram ngram, Iterable<Gram> values, Context context)
    throws IOException, InterruptedException {
  int[] gramFreq = { -1, -1 };
  if (ngram.getType() == Gram.Type.UNIGRAM && emitUnigrams) {
    DoubleWritable dd = new DoubleWritable(ngram.getFrequency());
    Text t = new Text(ngram.getString());
    context.write(t, dd);
    return;
  }
  // FIXME: better way to handle errors? Wouldn't an exception thrown here
  // cause hadoop to re-try the job?
  String[] gram = new String[2];
  for (Gram value : values) {
    int pos = value.getType() == Gram.Type.HEAD ? 0 : 1;
    if (gramFreq[pos] != -1) {
      log.warn("Extra {} for {}, skipping", value.getType(), ngram);
      if (value.getType() == Gram.Type.HEAD) {
        context.getCounter(Skipped.EXTRA_HEAD).increment(1);
      } else {
        context.getCounter(Skipped.EXTRA_TAIL).increment(1);
      }
      return;
    }
    gram[pos] = value.getString();
    gramFreq[pos] = value.getFrequency();
  }
  if (gramFreq[0] == -1) {
    log.warn("Missing head for {}, skipping.", ngram);
    context.getCounter(Skipped.MISSING_HEAD).increment(1);
    return;
  } else if (gramFreq[1] == -1) {
    log.warn("Missing tail for {}, skipping", ngram);
    context.getCounter(Skipped.MISSING_TAIL).increment(1);
    return;
  }
  int k11 = ngram.getFrequency();               /* a&b   */
  int k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b  */
  int k21 = gramFreq[1] - ngram.getFrequency(); /* b&!a  */
  int k22 = (int) (ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency())); /* !a&!b */
  try {
    double llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
    if (llr < minLLRValue) {
      context.getCounter(Skipped.LESS_THAN_MIN_LLR).increment(1);
      return;
    }
    DoubleWritable dd = new DoubleWritable(llr);
    Text t = new Text(ngram.getString());
    context.write(t, dd);
  } catch (IllegalArgumentException ex) {
    context.getCounter(Skipped.LLR_CALCULATION_ERROR).increment(1);
    log.error("Problem calculating LLR ratio: " + ex.getMessage());
    log.error("NGram: " + ngram);
    log.error("HEAD: " + gram[0] + ':' + gramFreq[0]);
    log.error("TAIL: " + gram[1] + ':' + gramFreq[1]);
    log.error("k11: " + k11 + " k12: " + k12 + " k21: " + k21 + " k22: " + k22);
  }
}
From source file:org.apache.mahout.vectorizer.collocations.llr.LLRReducer.java
License:Apache License
/**
 * Perform the LLR calculation. Input is k:ngram:ngramFreq, v:(h_|t_)subgram:subgramFreq,
 * N = ngram total.
 *
 * Each ngram has two subgrams, a head and a tail, referred to as A and B respectively below.
 *
 *  A+ B: number of times a+b appear together: ngramFreq
 *  A+!B: number of times A appears without B: hSubgramFreq - ngramFreq
 * !A+ B: number of times B appears without A: tSubgramFreq - ngramFreq
 * !A+!B: number of times neither A nor B appears (in that order):
 *        N - (subgramFreqA + subgramFreqB - ngramFreq)
 */
@Override
protected void reduce(Gram ngram, Iterable<Gram> values, Context context)
    throws IOException, InterruptedException {
  int[] gramFreq = { -1, -1 };
  if (ngram.getType() == Gram.Type.UNIGRAM && emitUnigrams) {
    DoubleWritable dd = new DoubleWritable(ngram.getFrequency());
    Text t = new Text(ngram.getString());
    context.write(t, dd);
    return;
  }
  // TODO better way to handle errors? Wouldn't an exception thrown here
  // cause hadoop to re-try the job?
  String[] gram = new String[2];
  for (Gram value : values) {
    int pos = value.getType() == Gram.Type.HEAD ? 0 : 1;
    if (gramFreq[pos] != -1) {
      log.warn("Extra {} for {}, skipping", value.getType(), ngram);
      if (value.getType() == Gram.Type.HEAD) {
        context.getCounter(Skipped.EXTRA_HEAD).increment(1);
      } else {
        context.getCounter(Skipped.EXTRA_TAIL).increment(1);
      }
      return;
    }
    gram[pos] = value.getString();
    gramFreq[pos] = value.getFrequency();
  }
  if (gramFreq[0] == -1) {
    log.warn("Missing head for {}, skipping.", ngram);
    context.getCounter(Skipped.MISSING_HEAD).increment(1);
    return;
  }
  if (gramFreq[1] == -1) {
    log.warn("Missing tail for {}, skipping", ngram);
    context.getCounter(Skipped.MISSING_TAIL).increment(1);
    return;
  }
  long k11 = ngram.getFrequency();               /* a&b   */
  long k12 = gramFreq[0] - ngram.getFrequency(); /* a&!b  */
  long k21 = gramFreq[1] - ngram.getFrequency(); /* b&!a  */
  long k22 = ngramTotal - (gramFreq[0] + gramFreq[1] - ngram.getFrequency()); /* !a&!b */
  double llr;
  try {
    llr = ll.logLikelihoodRatio(k11, k12, k21, k22);
  } catch (IllegalArgumentException ex) {
    context.getCounter(Skipped.LLR_CALCULATION_ERROR).increment(1);
    log.warn("Problem calculating LLR ratio for ngram {}, HEAD {}:{}, TAIL {}:{}, "
        + "k11/k12/k21/k22: {}/{}/{}/{}",
        ngram, gram[0], gramFreq[0], gram[1], gramFreq[1], k11, k12, k21, k22, ex);
    return;
  }
  if (llr < minLLRValue) {
    context.getCounter(Skipped.LESS_THAN_MIN_LLR).increment(1);
  } else {
    context.write(new Text(ngram.getString()), new DoubleWritable(llr));
  }
}
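To make the k11/k12/k21/k22 bookkeeping concrete, here is a hedged, standalone illustration with made-up counts, using Mahout's org.apache.mahout.math.stats.LogLikelihood (the same computation the reducers above delegate to via ll.logLikelihoodRatio):

import org.apache.mahout.math.stats.LogLikelihood;

public class LlrDemo {
    public static void main(String[] args) {
        // Hypothetical counts: the bigram A+B occurs 100 times, the head
        // subgram A 1000 times, the tail subgram B 160 times, N = 1,000,000.
        long ngramFreq = 100, headFreq = 1000, tailFreq = 160, n = 1000000;
        long k11 = ngramFreq;                             // A and B together
        long k12 = headFreq - ngramFreq;                  // A without B
        long k21 = tailFreq - ngramFreq;                  // B without A
        long k22 = n - (headFreq + tailFreq - ngramFreq); // neither A nor B
        // A large ratio means A and B co-occur far more often than chance predicts.
        System.out.println(LogLikelihood.logLikelihoodRatio(k11, k12, k21, k22));
    }
}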
From source file:org.apache.nifi.processors.hive.TestConvertAvroToORC.java
License:Apache License
@Test
public void test_onTrigger_nested_complex_record() throws Exception {
  Map<String, List<Double>> mapData1 = new TreeMap<String, List<Double>>() {
    {
      put("key1", Arrays.asList(1.0, 2.0));
      put("key2", Arrays.asList(3.0, 4.0));
    }
  };
  Map<String, String> arrayMap11 = new TreeMap<String, String>() {
    {
      put("key1", "v1");
      put("key2", "v2");
    }
  };
  Map<String, String> arrayMap12 = new TreeMap<String, String>() {
    {
      put("key3", "v3");
      put("key4", "v4");
    }
  };

  GenericData.Record record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData1,
      Arrays.asList(arrayMap11, arrayMap12));
  DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
  DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  fileWriter.create(record.getSchema(), out);
  fileWriter.append(record);

  // Put another record in
  Map<String, List<Double>> mapData2 = new TreeMap<String, List<Double>>() {
    {
      put("key1", Arrays.asList(-1.0, -2.0));
      put("key2", Arrays.asList(-3.0, -4.0));
    }
  };
  Map<String, String> arrayMap21 = new TreeMap<String, String>() {
    {
      put("key1", "v-1");
      put("key2", "v-2");
    }
  };
  Map<String, String> arrayMap22 = new TreeMap<String, String>() {
    {
      put("key3", "v-3");
      put("key4", "v-4");
    }
  };

  record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData2,
      Arrays.asList(arrayMap21, arrayMap22));
  fileWriter.append(record);
  fileWriter.flush();
  fileWriter.close();
  out.close();

  Map<String, String> attributes = new HashMap<String, String>() {
    {
      put(CoreAttributes.FILENAME.key(), "test");
    }
  };
  runner.enqueue(out.toByteArray(), attributes);
  runner.run();

  runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

  // Write the flow file out to disk, since the ORC Reader needs a path
  MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
  assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS nested_complex_record "
      + "(myMapOfArray MAP<STRING, ARRAY<DOUBLE>>, myArrayOfMap ARRAY<MAP<STRING, STRING>>)"
      + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
  assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
  assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
  byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
  FileOutputStream fos = new FileOutputStream("target/test1.orc");
  fos.write(resultContents);
  fos.flush();
  fos.close();

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.getLocal(conf);
  Reader reader = OrcFile.createReader(new Path("target/test1.orc"),
      OrcFile.readerOptions(conf).filesystem(fs));
  RecordReader rows = reader.rows();
  Object o = rows.next(null);
  assertNotNull(o);
  assertTrue(o instanceof OrcStruct);
  TypeInfo resultSchema = TestNiFiOrcUtils.buildNestedComplexOrcSchema();
  StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);

  // check values
  Object myMapOfArray = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMapOfArray"));
  assertTrue(myMapOfArray instanceof Map);
  Map map = (Map) myMapOfArray;
  Object mapValue = map.get(new Text("key1"));
  assertNotNull(mapValue);
  assertTrue(mapValue instanceof List);
  assertEquals(Arrays.asList(new DoubleWritable(1.0), new DoubleWritable(2.0)), mapValue);

  Object myArrayOfMap = inspector.getStructFieldData(o, inspector.getStructFieldRef("myArrayOfMap"));
  assertTrue(myArrayOfMap instanceof List);
  List list = (List) myArrayOfMap;
  Object el0 = list.get(0);
  assertNotNull(el0);
  assertTrue(el0 instanceof Map);
  assertEquals(new Text("v1"), ((Map) el0).get(new Text("key1")));
}
From source file:org.apache.orc.mapred.TestOrcFileEvolution.java
License:Apache License
private WritableComparable assembleRecord(TypeDescription type, Object row) {
  if (row == null) {
    return null;
  }
  switch (type.getCategory()) {
  case STRUCT:
    OrcStruct structResult = new OrcStruct(type);
    for (int i = 0; i < structResult.getNumFields(); i++) {
      List<TypeDescription> childTypes = type.getChildren();
      structResult.setFieldValue(i, assembleRecord(childTypes.get(i), ((List<Object>) row).get(i)));
    }
    return structResult;
  case LIST:
    OrcList<WritableComparable> listResult = new OrcList<>(type);
    TypeDescription elemType = type.getChildren().get(0);
    List<Object> elems = (List<Object>) row;
    for (int i = 0; i < elems.size(); i++) {
      listResult.add(assembleRecord(elemType, elems.get(i)));
    }
    return listResult;
  case MAP:
    OrcMap<WritableComparable, WritableComparable> mapResult = new OrcMap<>(type);
    TypeDescription keyType = type.getChildren().get(0);
    TypeDescription valueType = type.getChildren().get(1);
    for (Map.Entry<Object, Object> entry : ((Map<Object, Object>) row).entrySet()) {
      mapResult.put(assembleRecord(keyType, entry.getKey()),
          assembleRecord(valueType, entry.getValue()));
    }
    return mapResult;
  case INT:
    return new IntWritable((Integer) row);
  case DOUBLE:
    return new DoubleWritable((Double) row);
  case STRING:
    return new Text((String) row);
  default:
    throw new UnsupportedOperationException(
        String.format("Not expecting to have a field of type %s in unit tests", type.getCategory()));
  }
}
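A hedged fragment (hypothetical schema and data, meant to live inside the same test class) showing the row convention assembleRecord consumes: structs as List<Object>, maps as Map<Object, Object>, and leaves as boxed primitives:

// Builds an OrcStruct holding IntWritable(42), DoubleWritable(2.5), Text("hi").
TypeDescription schema = TypeDescription.fromString("struct<i:int,d:double,s:string>");
WritableComparable rec = assembleRecord(schema, Arrays.asList(42, 2.5, "hi"));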
From source file:org.apache.orc.mapred.TestOrcOutputFormat.java
License:Apache License
@Test
public void testAllTypes() throws Exception {
  conf.set("mapreduce.task.attempt.id", "attempt_20160101_0001_m_000001_0");
  conf.setOutputCommitter(NullOutputCommitter.class);
  final String typeStr = "struct<b1:binary,b2:boolean,b3:tinyint,"
      + "c:char(10),d1:date,d2:decimal(20,5),d3:double,fff:float,int:int,"
      + "l:array<bigint>,map:map<smallint,string>,"
      + "str:struct<u:uniontype<timestamp,varchar(100)>>,ts:timestamp>";
  OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
  FileOutputFormat.setOutputPath(conf, workDir);
  TypeDescription type = TypeDescription.fromString(typeStr);

  // build a row object
  OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
  ((BytesWritable) row.getFieldValue(0)).set(new byte[] { 1, 2, 3, 4 }, 0, 4);
  ((BooleanWritable) row.getFieldValue(1)).set(true);
  ((ByteWritable) row.getFieldValue(2)).set((byte) 23);
  ((Text) row.getFieldValue(3)).set("aaabbbcccddd");
  SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
  ((DateWritable) row.getFieldValue(4))
      .set(DateWritable.millisToDays(format.parse("2016-04-01").getTime()));
  ((HiveDecimalWritable) row.getFieldValue(5)).set(new HiveDecimalWritable("1.23"));
  ((DoubleWritable) row.getFieldValue(6)).set(1.5);
  ((FloatWritable) row.getFieldValue(7)).set(4.5f);
  ((IntWritable) row.getFieldValue(8)).set(31415);
  OrcList<LongWritable> longList = (OrcList<LongWritable>) row.getFieldValue(9);
  longList.add(new LongWritable(123));
  longList.add(new LongWritable(456));
  OrcMap<ShortWritable, Text> map = (OrcMap<ShortWritable, Text>) row.getFieldValue(10);
  map.put(new ShortWritable((short) 1000), new Text("aaaa"));
  map.put(new ShortWritable((short) 123), new Text("bbbb"));
  OrcStruct struct = (OrcStruct) row.getFieldValue(11);
  OrcUnion union = (OrcUnion) struct.getFieldValue(0);
  union.set((byte) 1, new Text("abcde"));
  ((OrcTimestamp) row.getFieldValue(12)).set("1996-12-11 15:00:00");

  NullWritable nada = NullWritable.get();
  RecordWriter<NullWritable, OrcStruct> writer =
      new OrcOutputFormat<OrcStruct>().getRecordWriter(fs, conf, "all.orc", Reporter.NULL);
  for (int r = 0; r < 10; ++r) {
    row.setFieldValue(8, new IntWritable(r * 10));
    writer.write(nada, row);
  }
  union.set((byte) 0, new OrcTimestamp("2011-12-25 12:34:56"));
  for (int r = 0; r < 10; ++r) {
    row.setFieldValue(8, new IntWritable(r * 10 + 100));
    writer.write(nada, row);
  }
  OrcStruct row2 = new OrcStruct(type);
  writer.write(nada, row2);
  row.setFieldValue(8, new IntWritable(210));
  writer.write(nada, row);
  writer.close(Reporter.NULL);

  FileSplit split = new FileSplit(new Path(workDir, "all.orc"), 0, 100000, new String[0]);
  RecordReader<NullWritable, OrcStruct> reader =
      new OrcInputFormat<OrcStruct>().getRecordReader(split, conf, Reporter.NULL);
  nada = reader.createKey();
  row = reader.createValue();
  for (int r = 0; r < 22; ++r) {
    assertEquals(true, reader.next(nada, row));
    if (r == 20) {
      for (int c = 0; c < 12; ++c) {
        assertEquals(null, row.getFieldValue(c));
      }
    } else {
      assertEquals(new BytesWritable(new byte[] { 1, 2, 3, 4 }), row.getFieldValue(0));
      assertEquals(new BooleanWritable(true), row.getFieldValue(1));
      assertEquals(new ByteWritable((byte) 23), row.getFieldValue(2));
      assertEquals(new Text("aaabbbcccd"), row.getFieldValue(3));
      assertEquals(new DateWritable(DateWritable.millisToDays(format.parse("2016-04-01").getTime())),
          row.getFieldValue(4));
      assertEquals(new HiveDecimalWritable("1.23"), row.getFieldValue(5));
      assertEquals(new DoubleWritable(1.5), row.getFieldValue(6));
      assertEquals(new FloatWritable(4.5f), row.getFieldValue(7));
      assertEquals(new IntWritable(r * 10), row.getFieldValue(8));
      assertEquals(longList, row.getFieldValue(9));
      assertEquals(map, row.getFieldValue(10));
      if (r < 10) {
        union.set((byte) 1, new Text("abcde"));
      } else {
        union.set((byte) 0, new OrcTimestamp("2011-12-25 12:34:56"));
      }
      assertEquals("row " + r, struct, row.getFieldValue(11));
      assertEquals("row " + r, new OrcTimestamp("1996-12-11 15:00:00"), row.getFieldValue(12));
    }
  }
  assertEquals(false, reader.next(nada, row));
}