List of usage examples for org.apache.hadoop.io.IntWritable
public IntWritable()
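Before the project-specific examples below, here is a minimal, self-contained sketch (not taken from any of the listed source files) of what the no-argument constructor is typically used for: it creates a mutable, reusable holder whose value defaults to 0 and is filled in later with set().

import org.apache.hadoop.io.IntWritable;

public class IntWritableNoArgExample {
    public static void main(String[] args) {
        // The no-argument constructor creates a mutable box whose value defaults to 0.
        IntWritable writable = new IntWritable();
        writable.set(163);
        System.out.println(writable.get()); // prints 163

        // The same instance is normally reused across records rather than
        // allocating a new IntWritable for every value.
        for (int i = 0; i < 3; i++) {
            writable.set(i);
            System.out.println(writable);
        }
    }
}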
From source file:com.yahoo.glimmer.indexing.generator.IndexRecordWriterTest.java
License:Open Source License
@Test
public void test() throws Exception {
    context.checking(new Expectations() {
        {
            allowing(taskContext).getConfiguration();
            will(returnValue(conf));
            allowing(taskContext).getTaskAttemptID();
            will(returnValue(taskAttemptID));
        }
    });

    OutputFormat outputFormat = new IndexRecordWriter.OutputFormat();

    conf.setStrings("RdfFieldNames", "index0", "index1");
    conf.setEnum("IndexType", RDFDocumentFactory.IndexType.VERTICAL);
    RecordWriter<IntWritable, IndexRecordWriterValue> recordWriter = outputFormat.getRecordWriter(taskContext);

    IntWritable key = new IntWritable();
    IndexRecordWriterTermValue termValue = new IndexRecordWriterTermValue();
    IndexRecordWriterDocValue docValue = new IndexRecordWriterDocValue();
    IndexRecordWriterSizeValue sizeValue = new IndexRecordWriterSizeValue();

    // ALIGNMENT_INDEX
    key.set(DocumentMapper.ALIGNMENT_INDEX);
    termValue.setTerm("term1");
    termValue.setTermFrequency(1);
    // The alignment index doesn't have positions/counts.
    termValue.setOccurrenceCount(0);
    termValue.setSumOfMaxTermPositions(0);
    recordWriter.write(key, termValue);
    docValue.setDocument(0); // term1 occurs in index 0
    recordWriter.write(key, docValue);

    // Index 0
    key.set(0);
    termValue.setTermFrequency(3);
    termValue.setOccurrenceCount(6);
    termValue.setSumOfMaxTermPositions(15 + 12 + 18);
    recordWriter.write(key, termValue);
    docValue.setDocument(3);
    docValue.clearOccerrences();
    docValue.addOccurrence(11);
    docValue.addOccurrence(15);
    recordWriter.write(key, docValue);
    docValue.setDocument(4);
    docValue.clearOccerrences();
    docValue.addOccurrence(12);
    recordWriter.write(key, docValue);
    docValue.setDocument(7);
    docValue.clearOccerrences();
    docValue.addOccurrence(14);
    docValue.addOccurrence(17);
    docValue.addOccurrence(18);
    recordWriter.write(key, docValue);

    // ALIGNMENT_INDEX
    key.set(DocumentMapper.ALIGNMENT_INDEX);
    termValue.setTerm("term2");
    termValue.setTermFrequency(2);
    // The alignment index doesn't have positions/counts.
    termValue.setOccurrenceCount(0);
    termValue.setSumOfMaxTermPositions(0);
    recordWriter.write(key, termValue);
    docValue.clearOccerrences();
    docValue.setDocument(0); // term2 occurs in index 0 & 1
    recordWriter.write(key, docValue);
    docValue.setDocument(1); // term2 occurs in index 0 & 1
    recordWriter.write(key, docValue);

    // Index 0
    key.set(0);
    termValue.setTermFrequency(2);
    termValue.setOccurrenceCount(4);
    termValue.setSumOfMaxTermPositions(19 + 16);
    recordWriter.write(key, termValue);
    docValue.setDocument(1);
    docValue.clearOccerrences();
    docValue.addOccurrence(10);
    docValue.addOccurrence(19);
    recordWriter.write(key, docValue);
    docValue.setDocument(7);
    docValue.clearOccerrences();
    docValue.addOccurrence(13);
    docValue.addOccurrence(16);
    recordWriter.write(key, docValue);

    // Index 1
    key.set(1);
    termValue.setTermFrequency(1);
    termValue.setOccurrenceCount(1);
    termValue.setSumOfMaxTermPositions(14);
    recordWriter.write(key, termValue);
    docValue.setDocument(1);
    docValue.clearOccerrences();
    docValue.addOccurrence(14);
    recordWriter.write(key, docValue);

    // ALIGNMENT_INDEX
    key.set(DocumentMapper.ALIGNMENT_INDEX);
    termValue.setTerm("term3");
    termValue.setTermFrequency(1);
    // The alignment index doesn't have positions/counts.
    termValue.setOccurrenceCount(0);
    termValue.setSumOfMaxTermPositions(0);
    recordWriter.write(key, termValue);
    docValue.setDocument(1); // term3 occurs in index 1
    recordWriter.write(key, docValue);
    docValue.clearOccerrences();

    // Index 1
    key.set(1);
    termValue.setTermFrequency(1);
    termValue.setOccurrenceCount(2);
    termValue.setSumOfMaxTermPositions(11);
    recordWriter.write(key, termValue);
    docValue.setDocument(3);
    docValue.clearOccerrences();
    docValue.addOccurrence(10);
    docValue.addOccurrence(11);
    recordWriter.write(key, docValue);

    // Doc Sizes.
    key.set(0);
    sizeValue.setDocument(0);
    sizeValue.setSize(3);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(3);
    sizeValue.setSize(1);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(4);
    sizeValue.setSize(10);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(6);
    sizeValue.setSize(2);
    recordWriter.write(key, sizeValue);

    key.set(1);
    sizeValue.setDocument(3);
    sizeValue.setSize(3);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(6);
    sizeValue.setSize(5);
    recordWriter.write(key, sizeValue);

    recordWriter.close(taskContext);

    // Check the written indexes.
    Path workPath = outputFormat.getDefaultWorkFile(taskContext, "");
    System.out.println("Default work file is " + workPath.toString());
    String dir = workPath.toUri().getPath();

    BitStreamIndex index0 = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/index0", true, true);
    assertEquals(8, index0.numberOfDocuments);
    assertEquals(2, index0.numberOfTerms);
    assertTrue(index0.hasPositions);
    // term1
    checkOccurrences(index0.documents(0), 3, "(3:11,15) (4:12) (7:14,17,18)");
    // term2
    checkOccurrences(index0.documents(1), 2, "(1:10,19) (7:13,16)");
    assertEquals("[3, 0, 0, 1, 10, 0, 2, 0]", index0.sizes.toString());

    BitStreamIndex index1 = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/index1", true, true);
    assertEquals(8, index1.numberOfDocuments);
    assertEquals(2, index1.numberOfTerms);
    assertTrue(index0.hasPositions);
    checkOccurrences(index1.documents(0), 1, "(1:14)");
    // term3
    checkOccurrences(index1.documents(1), 1, "(3:10,11)");

    BitStreamIndex indexAlignment = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/alignment", true);
    assertEquals(8, indexAlignment.numberOfDocuments);
    assertEquals(3, indexAlignment.numberOfTerms);
    assertFalse(indexAlignment.hasPositions);
    // term1
    assertEquals(1, indexAlignment.documents(0).frequency());
    // term2
    assertEquals(2, indexAlignment.documents(1).frequency());
    // term3
    assertEquals(1, indexAlignment.documents(2).frequency());

    assertEquals("[0, 0, 0, 3, 0, 0, 5, 0]", index1.sizes.toString());
}
From source file:com.yahoo.glimmer.indexing.generator.TermReduce.java
License:Open Source License
@Override
protected void setup(
        org.apache.hadoop.mapreduce.Reducer<TermKey, TermValue, IntWritable, IndexRecordWriterValue>.Context context)
        throws IOException, InterruptedException {
    writerKey = new IntWritable();
    writerTermValue = new IndexRecordWriterTermValue();
    writerDocValue = new IndexRecordWriterDocValue();
    writerSizeValue = new IndexRecordWriterSizeValue();
    predicatedIds = new ArrayList<Long>();
}
From source file:com.yahoo.semsearch.fastlinking.io.Datapack.java
License:Apache License
private void merge(String anchorMapPath, String dfMapPath, String multiple_out, String out, String ngram)
        throws IOException {
    JobConf conf = new JobConf(getConf(), Datapack.class);
    FileSystem fs = FileSystem.get(conf);

    BufferedWriter anchorsDataOut;
    BufferedWriter anchorsTSVOut;

    Boolean multiple_output = (multiple_out != null && multiple_out.equalsIgnoreCase("true"));
    Boolean ngram_output = (ngram != null && ngram.equalsIgnoreCase("true"));

    if (!multiple_output) {
        anchorsDataOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), outputEncoding));
        anchorsTSVOut = null;
    } else {
        anchorsDataOut = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(out + ".dat"), outputEncoding));
        anchorsTSVOut = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(out + ".tsv"), outputEncoding));
    }

    // Loop over anchors
    MapFile.Reader anchorMapReader = new MapFile.Reader(new Path(anchorMapPath + "/part-00000"), conf);
    MapFile.Reader dfMapReader = new MapFile.Reader(new Path(dfMapPath + "/part-00000"), conf);

    /*FileStatus[] status = fs.listStatus( new Path( dfMapPath ) ); // you need to pass in your hdfs path
    for( FileStatus fileStatus : status ) {
        if( !fileStatus.getPath().toString().contains( "part-" )) continue;
        MapFile.Reader dfMapReader = new MapFile.Reader( fileStatus.getPath(), conf );
    */

    Text akey = new Text();
    Text dkey = new Text();
    IntWritable df = new IntWritable();
    HMapSIW map = new HMapSIW();

    while (anchorMapReader.next(akey, map)) {
        // since they are both sorted we can just iterate over both
        // TODO if need be, artificially add a 0 count to unseen anchors
        dfMapReader.next(dkey, df);
        while (!akey.toString().equalsIgnoreCase(dkey.toString())) {
            //System.err.println("Mismatch: '" + akey + "' and '" + dkey + "'");
            anchorMapReader.next(akey, map);
        }
        String l = akey.toString();

        // while( dfMapReader.next( dkey, df ) ) {
        // String l = dkey.toString();
        if (l.trim().length() < 2)
            continue;

        StringBuilder targets = new StringBuilder();
        int total = 0;
        for (String target : map.keySet()) {
            int count = map.get(target);
            total += count;
            String entity = URLEncoder.encode(target.replaceAll(" ", "_"), "UTF-8");
            targets.append(entity);
            targets.append(SEPARATOR);
            targets.append(Integer.toString(count));
            targets.append("\t");
        }

        if (StringUtils.isNumeric(l) && total < 2)
            continue;

        //System.err.println("targets " + targets);
        if (targets.length() < 2)
            continue;

        if (!ngram_output) {
            anchorsDataOut.write(l);
            anchorsDataOut.write(SEPARATOR);
            anchorsDataOut.write(Integer.toString(df.get()));
            anchorsDataOut.write(SEPARATOR);
            anchorsDataOut.write(Integer.toString(total));
            anchorsDataOut.write("\t");
            anchorsDataOut.write(targets.substring(0, targets.length() - 1));
            anchorsDataOut.write("\n");
            anchorsDataOut.flush();

            if (multiple_output) {
                for (String target : map.keySet()) {
                    int count = map.get(target);
                    String entity = URLEncoder.encode(target.replaceAll(" ", "_"), "UTF-8");
                    anchorsTSVOut.write(l);
                    anchorsTSVOut.write("\t");
                    anchorsTSVOut.write(Integer.toString(df.get()));
                    anchorsTSVOut.write("\t");
                    anchorsTSVOut.write(Integer.toString(total));
                    anchorsTSVOut.write("\t");
                    anchorsTSVOut.write(entity);
                    anchorsTSVOut.write("\t");
                    anchorsTSVOut.write(Integer.toString(count));
                    anchorsTSVOut.write("\n");
                    anchorsTSVOut.flush();
                }
            }
        } else {
            String parts[] = l.split("\\s+");
            for (int i = 0; i < parts.length; i++) {
                StringBuilder sb = new StringBuilder();
                for (int j = i; j < parts.length; j++) {
                    sb.append(parts[j]);
                    String ss = sb.toString();
                    anchorsDataOut.write(ss);
                    anchorsDataOut.write(SEPARATOR);
                    anchorsDataOut.write(Integer.toString(df.get()));
                    anchorsDataOut.write(SEPARATOR);
                    anchorsDataOut.write(Integer.toString(total));
                    anchorsDataOut.write("\t");
                    anchorsDataOut.write(targets.substring(0, targets.length() - 1));
                    anchorsDataOut.write("\n");
                    anchorsDataOut.flush();

                    if (multiple_output) {
                        for (String target : map.keySet()) {
                            int count = map.get(target);
                            String entity = URLEncoder.encode(target.replaceAll(" ", "_"), "UTF-8");
                            anchorsTSVOut.write(ss);
                            anchorsTSVOut.write("\t");
                            anchorsTSVOut.write(Integer.toString(df.get()));
                            anchorsTSVOut.write("\t");
                            anchorsTSVOut.write(Integer.toString(total));
                            anchorsTSVOut.write("\t");
                            anchorsTSVOut.write(entity);
                            anchorsTSVOut.write("\t");
                            anchorsTSVOut.write(Integer.toString(count));
                            anchorsTSVOut.write("\n");
                            anchorsTSVOut.flush();
                        }
                        sb.append(" ");
                    }
                }
            }
        }
    }
    dfMapReader.close();
    //}

    anchorsDataOut.close();
    if (multiple_output) {
        anchorsTSVOut.close();
    }
    //anchorMapReader.close();
    fs.close();
}
From source file:com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java
License:Apache License
private void merge(String anchorMapPath, String dfMapPath) throws IOException {
    LOG.info("Extracting anchor text (merge)...");
    LOG.info(" - input: " + anchorMapPath);
    LOG.info(" - output: " + dfMapPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    FileSystem fs = FileSystem.get(conf);

    // Loop over anchors
    MapFile.Reader anchorMapReader = new MapFile.Reader(new Path(anchorMapPath + "/part-00000"), conf);
    MapFile.Reader dfMapReader = new MapFile.Reader(new Path(dfMapPath + "/part-00000"), conf);

    // IntWritable key = new IntWritable(Integer.parseInt(cmdline.getArgs()[0]));
    // System.out.println(key.toString());

    Text key = new Text();
    IntWritable df = new IntWritable();
    while (dfMapReader.next(key, df)) {
        //if (!key.toString().equalsIgnoreCase("Jim Durham"))
        //    continue;
        HMapSIW map = new HMapSIW();
        anchorMapReader.get(key, map);
        System.out.println(key + "\t" + df + "\t" + map.toString());
        // for (String entity : map.keySet()) {
        //     System.out.println("\t" + entity + "\t" + map.get(entity) + "\n");
        // }
        break;
    }
    anchorMapReader.close();
    dfMapReader.close();
    fs.close();
}
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws IOException {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    IntWritable key = new IntWritable();
    Text value = new Text();
    MapFile.Writer writer = null;
    try {
        writer = new MapFile.Writer(conf, fs, uri, key.getClass(), value.getClass());

        for (int i = 0; i < 1024; i++) {
            key.set(i + 1);
            value.set(DATA[i % DATA.length]);
            writer.append(key, value);
        }
    } finally {
        IOUtils.closeStream(writer);
    }
}
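A MapFile written this way can be read back with MapFile.Reader. The following companion sketch is not part of the original source file; it assumes the same uri argument and uses the older FileSystem-based reader constructor. A single IntWritable/Text pair is reused for the lookup.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;

public class MapFileReadExample {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);

        MapFile.Reader reader = new MapFile.Reader(fs, uri, conf);
        try {
            // Random access by key: get() fills 'value' and returns it, or null if the key is absent.
            IntWritable key = new IntWritable(496);
            Text value = new Text();
            if (reader.get(key, value) != null) {
                System.out.printf("%s\t%s%n", key, value);
            }
        } finally {
            reader.close();
        }
    }
}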
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws IOException {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    Path path = new Path(uri);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
        writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());

        for (int i = 0; i < 100; i++) {
            key.set(100 - i);
            value.set(DATA[i % DATA.length]);
            System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
            writer.append(key, value);
        }
    } finally {
        IOUtils.closeStream(writer);
    }
}
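As a companion to the writer above (not part of the original source file), a SequenceFile written this way is usually read back by reusing one key/value pair and letting next() fill them in place. This sketch assumes the same uri argument and the known IntWritable/Text record types.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileReadExample {
    public static void main(String[] args) throws Exception {
        String uri = args[0];
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path path = new Path(uri);

        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, path, conf);
            // Reuse one key/value pair for every record; next() returns false at end of file.
            IntWritable key = new IntWritable();
            Text value = new Text();
            while (reader.next(key, value)) {
                System.out.printf("%s\t%s%n", key, value);
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}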
From source file:crunch.MaxTemperature.java
License:Apache License
@Test
public void walkthroughWithNoArgsConstructor() throws IOException {
    // vv IntWritableTest
    IntWritable writable = new IntWritable();
    writable.set(163);
    // ^^ IntWritableTest
    checkWalkthrough(writable);
}
From source file:crunch.MaxTemperature.java
License:Apache License
private void checkWalkthrough(IntWritable writable) throws IOException {
    // vv IntWritableTest-SerializedLength
    byte[] bytes = serialize(writable);
    assertThat(bytes.length, is(4));
    // ^^ IntWritableTest-SerializedLength

    // vv IntWritableTest-SerializedBytes
    assertThat(StringUtils.byteToHexString(bytes), is("000000a3"));
    // ^^ IntWritableTest-SerializedBytes

    // vv IntWritableTest-Deserialization
    IntWritable newWritable = new IntWritable();
    deserialize(newWritable, bytes);
    assertThat(newWritable.get(), is(163));
    // ^^ IntWritableTest-Deserialization
}
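The serialize() and deserialize() helpers called in this test are defined elsewhere in the source file and are not shown in this listing. A plausible minimal sketch, assuming the conventional Writable round trip through Java data streams (the class name here is only illustrative), would be:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class WritableSerializationHelpers {
    // Serialize a Writable into a byte array via its write() method.
    public static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }

    // Repopulate an existing Writable from a byte array via readFields().
    public static byte[] deserialize(Writable writable, byte[] bytes) throws IOException {
        ByteArrayInputStream in = new ByteArrayInputStream(bytes);
        DataInputStream dataIn = new DataInputStream(in);
        writable.readFields(dataIn);
        dataIn.close();
        return bytes;
    }
}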
From source file:DAAL.KmeansStep1Mapper.java
License:Open Source License
@Override
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
    /* Read a data set */
    String filePath = "/Hadoop/Kmeans/data/" + value;
    double[] data = new double[nFeatures * nVectorsInBlock];
    readData(filePath, nFeatures, nVectorsInBlock, data);

    DaalContext daalContext = new DaalContext();

    HomogenNumericTable ntData = new HomogenNumericTable(daalContext, data, nFeatures, nVectorsInBlock);

    /* Create an algorithm to compute k-means on local nodes */
    DistributedStep1Local kmeansLocal = new DistributedStep1Local(daalContext, Double.class,
            Method.defaultDense, nClusters);

    /* Get the centroids table computed in step 2 */
    SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(),
            SequenceFile.Reader.file(new Path("/Hadoop/Kmeans/initResults/centroids")));
    IntWritable step1key = new IntWritable();
    WriteableData step1value = new WriteableData();
    reader.next(step1key, step1value);
    reader.close();

    HomogenNumericTable centroids = (HomogenNumericTable) step1value.getObject(daalContext);

    /* Set the algorithm parameters */
    kmeansLocal.input.set(InputId.data, ntData);
    kmeansLocal.input.set(InputId.inputCentroids, centroids);

    /* Compute k-means on local nodes */
    PartialResult pres = kmeansLocal.compute();

    /* Write the data prepended with a data set sequence number. Needed to know the position of the
       data set in the input data */
    context.write(new IntWritable(0), new WriteableData(index, pres));

    daalContext.dispose();

    index += totalTasks;
}
From source file:DAAL.QRStep3Mapper.java
License:Open Source License
@Override
public void map(IntWritable step2key, WriteableData step2value, Context context)
        throws IOException, InterruptedException {
    DaalContext daalContext = new DaalContext();

    SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(),
            SequenceFile.Reader.file(new Path("/Hadoop/QR/step1/step1x" + step2value.getId())));
    IntWritable step1key = new IntWritable();
    WriteableData step1value = new WriteableData();
    reader.next(step1key, step1value);
    reader.close();

    DataCollection s1 = (DataCollection) step1value.getObject(daalContext);
    DataCollection s2 = (DataCollection) step2value.getObject(daalContext);

    /* Create an algorithm to compute QR decomposition on the master node */
    DistributedStep3Local qrStep3Local = new DistributedStep3Local(daalContext, Double.class,
            Method.defaultDense);
    qrStep3Local.input.set(DistributedStep3LocalInputId.inputOfStep3FromStep1, s1);
    qrStep3Local.input.set(DistributedStep3LocalInputId.inputOfStep3FromStep2, s2);

    /* Compute QR decomposition in step 3 */
    qrStep3Local.compute();
    Result result = qrStep3Local.finalizeCompute();

    HomogenNumericTable Qi = (HomogenNumericTable) result.get(ResultId.matrixQ);

    SequenceFile.Writer writer = SequenceFile.createWriter(new Configuration(),
            SequenceFile.Writer.file(new Path("/Hadoop/QR/Output/Qx" + step2value.getId())),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(WriteableData.class));
    writer.append(new IntWritable(0), new WriteableData(step2value.getId(), Qi));
    writer.close();

    daalContext.dispose();
}