List of usage examples for org.apache.hadoop.mapreduce.OutputFormat#getRecordWriter
public abstract RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException;
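Before the project-specific examples, it helps to see the shape of the contract itself: the framework instantiates the OutputFormat, calls getRecordWriter(TaskAttemptContext) once per task attempt, feeds key/value pairs to the returned RecordWriter, and closes the writer when the task finishes. The following is a minimal sketch of an implementation; the class name PlainTextOutputFormat and the tab-separated formatting are illustrative assumptions, not code from any of the projects listed below.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Illustrative sketch: writes each key/value pair as a tab-separated line.
public class PlainTextOutputFormat<K, V> extends FileOutputFormat<K, V> {

    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        // FileOutputFormat supplies a per-task work file; the framework's
        // OutputCommitter later promotes it to the final output directory.
        Path file = getDefaultWorkFile(context, ".txt");
        FSDataOutputStream out = file.getFileSystem(context.getConfiguration()).create(file, false);
        return new RecordWriter<K, V>() {
            @Override
            public void write(K key, V value) throws IOException {
                // Relies on the key/value toString() representations.
                out.writeBytes(key + "\t" + value + "\n");
            }

            @Override
            public void close(TaskAttemptContext ctx) throws IOException {
                out.close();
            }
        };
    }
}

A job would select this format with job.setOutputFormatClass(PlainTextOutputFormat.class); note that most of the examples below sit on the other side of the contract, obtaining the writer reflectively from a job-configured OutputFormat rather than implementing one.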
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java
License:Apache License
@SuppressWarnings("unchecked") private synchronized RecordWriter getRecordWriter(String namedOutput) throws IOException, InterruptedException { // look for record-writer in the cache RecordWriter writer = recordWriters.get(namedOutput); // If not in cache, create a new one if (writer == null) { // get the record writer from context output format TaskAttemptContext taskContext = getContext(namedOutput); Class<? extends OutputFormat<?, ?>> outputFormatClass; try {//from w w w . ja v a 2 s .c om outputFormatClass = taskContext.getOutputFormatClass(); } catch (ClassNotFoundException e) { throw new IOException(e); } ClassLoader outputFormatClassLoader = outputFormatClass.getClassLoader(); // This is needed in case the OutputFormat's classloader conflicts with the program classloader (for example, // TableOutputFormat). ClassLoader oldClassLoader = ClassLoaders.setContextClassLoader(outputFormatClassLoader); try { // We use ReflectionUtils to instantiate the OutputFormat, because it also calls setConf on the object, if it // is a org.apache.hadoop.conf.Configurable. OutputFormat<?, ?> outputFormat = ReflectionUtils.newInstance(outputFormatClass, taskContext.getConfiguration()); writer = new MeteredRecordWriter<>(outputFormat.getRecordWriter(taskContext), context); } finally { ClassLoaders.setContextClassLoader(oldClassLoader); } // add the record-writer to the cache recordWriters.put(namedOutput, writer); } return writer; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputsMainOutputWrapper.java
License:Apache License
@Override public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException { OutputFormat<K, V> rootOutputFormat = getRootOutputFormat(job); return rootOutputFormat.getRecordWriter(job); }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) private void runMap(Job job, KeyValueSorter<?, ?> sorter) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = job.getConfiguration(); InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf); List<InputSplit> splits = input.getSplits(job); int serial = 1; for (InputSplit split : splits) { TaskAttemptID id = newTaskAttemptId(newMapTaskId(job.getJobID(), serial++), 0); Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf); if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$ mapper.getClass().getName(), id, split.getLength())); }//from ww w. j a va2 s. c om TaskAttemptContext context = newTaskAttemptContext(conf, id); // we always obtain a new OutputFormat object / OutputFormat.getOutputCommiter() may be cached OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf); OutputCommitter committer = output.getOutputCommitter(context); committer.setupTask(context); boolean succeed = false; try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, id))) { RecordWriter<?, ?> writer; if (sorter != null) { writer = new ShuffleWriter(sorter); } else { writer = output.getRecordWriter(newTaskAttemptContext(conf, id)); } try { Mapper.Context c = newMapperContext(conf, id, reader, writer, committer, split); reader.initialize(split, c); mapper.run(c); } finally { writer.close(newTaskAttemptContext(conf, id)); } doCommitTask(context, committer); succeed = true; } finally { if (succeed == false) { doAbortTask(context, committer); } } } }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) private void runReduce(Job job, KeyValueSorter<?, ?> sorter) throws ClassNotFoundException, IOException, InterruptedException { Configuration conf = job.getConfiguration(); OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf); TaskAttemptID id = newTaskAttemptId(newReduceTaskId(job.getJobID(), 1), 0); Reducer<?, ?, ?, ?> reducer = ReflectionUtils.newInstance(job.getReducerClass(), conf); if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("starting reducer: {0}@{1} ({2}records, {3}bytes)", //$NON-NLS-1$ reducer.getClass().getName(), id, sorter.getRecordCount(), sorter.getSizeInBytes())); }//from w w w . j av a 2 s . c o m TaskAttemptContext context = newTaskAttemptContext(conf, id); OutputCommitter committer = output.getOutputCommitter(context); committer.setupTask(context); boolean succeed = false; try { ShuffleReader reader = new ShuffleReader(sorter, new Progress()); try { RecordWriter<?, ?> writer = output.getRecordWriter(newTaskAttemptContext(conf, id)); try { Reducer.Context c = newReducerContext(conf, id, reader, sorter.getKeyClass(), sorter.getValueClass(), writer, committer, (RawComparator) job.getGroupingComparator()); reducer.run(c); } finally { writer.close(newTaskAttemptContext(conf, id)); } } finally { try { reader.close(); } catch (IOException e) { LOG.warn(MessageFormat.format("error occurred while reducer mapper input: {0} ({1})", id, job.getJobName()), e); } } doCommitTask(context, committer); succeed = true; } finally { if (succeed == false) { doAbortTask(context, committer); } } }
From source file:com.asakusafw.runtime.stage.output.StageOutputDriver.java
License:Apache License
private ResultOutput<?> buildNormalSink(String name, @SuppressWarnings("rawtypes") Class<? extends OutputFormat> formatClass, Class<?> keyClass, Class<?> valueClass, List<Counter> counters) throws IOException, InterruptedException { assert context != null; assert name != null; assert formatClass != null; assert keyClass != null; assert valueClass != null; assert counters != null; Job job = JobCompatibility.newJob(context.getConfiguration()); job.setOutputFormatClass(formatClass); job.setOutputKeyClass(keyClass);/*w w w .j a v a 2 s . c o m*/ job.setOutputValueClass(valueClass); TaskAttemptContext localContext = JobCompatibility.newTaskAttemptContext(job.getConfiguration(), context.getTaskAttemptID()); if (FileOutputFormat.class.isAssignableFrom(formatClass)) { setOutputFilePrefix(localContext, name); } OutputFormat<?, ?> format = ReflectionUtils.newInstance(formatClass, localContext.getConfiguration()); RecordWriter<?, ?> writer = format.getRecordWriter(localContext); return new ResultOutput<Writable>(localContext, writer); }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
public synchronized RecordWriter getRecordWriter(String baseFileName) throws IOException, InterruptedException {
    // Look for record-writer in the cache
    OutputContext context = outputContexts.get(baseFileName);

    // If not in cache, create a new one
    if (context == null) {
        context = new OutputContext();

        OutputFormat mainOutputFormat;
        try {
            mainOutputFormat = ((OutputFormat) ReflectionUtils.newInstance(this.context.getOutputFormatClass(),
                    this.context.getConfiguration()));
        } catch (ClassNotFoundException e1) {
            throw new RuntimeException(e1);
        }

        ProxyOutputCommitter baseOutputCommitter = ((ProxyOutputCommitter) mainOutputFormat
                .getOutputCommitter(this.context));

        // The trick is to create a new Job for each output
        Job job = new Job(this.context.getConfiguration());
        job.setOutputKeyClass(getNamedOutputKeyClass(this.context, baseFileName));
        job.setOutputValueClass(getNamedOutputValueClass(this.context, baseFileName));
        // Check possible specific context for the output
        setSpecificNamedOutputContext(this.context.getConfiguration(), job, baseFileName);
        TaskAttemptContext taskContext;
        try {
            taskContext = TaskAttemptContextFactory.get(job.getConfiguration(),
                    this.context.getTaskAttemptID());
        } catch (Exception e) {
            throw new IOException(e);
        }
        // First we change the output dir for the new OutputFormat that we will create.
        // We put it inside the main output work path -> in case the Job fails,
        // everything will be discarded
        taskContext.getConfiguration().set("mapred.output.dir",
                baseOutputCommitter.getBaseDir() + "/" + baseFileName);
        // This is for Hadoop 2.0:
        taskContext.getConfiguration().set("mapreduce.output.fileoutputformat.outputdir",
                baseOutputCommitter.getBaseDir() + "/" + baseFileName);
        context.taskAttemptContext = taskContext;

        // Load the OutputFormat instance
        OutputFormat outputFormat = InstancesDistributor.loadInstance(
                context.taskAttemptContext.getConfiguration(), OutputFormat.class,
                getNamedOutputFormatInstanceFile(this.context, baseFileName), true);
        // We have to create a JobContext for meeting the contract of the OutputFormat
        JobContext jobContext;
        try {
            jobContext = JobContextFactory.get(taskContext.getConfiguration(), taskContext.getJobID());
        } catch (Exception e) {
            throw new IOException(e);
        }
        context.jobContext = jobContext;
        // The contract of the OutputFormat is to check the output specs
        outputFormat.checkOutputSpecs(jobContext);
        // We get the output committer so we can call it later
        context.outputCommitter = outputFormat.getOutputCommitter(taskContext);
        // Save the RecordWriter to cache it
        context.recordWriter = outputFormat.getRecordWriter(taskContext);
        // if counters are enabled, wrap the writer with context to increment counters
        if (countersEnabled) {
            context.recordWriter = new RecordWriterWithCounter(context.recordWriter, baseFileName,
                    this.context);
        }
        outputContexts.put(baseFileName, context);
    }
    return context.recordWriter;
}
From source file:com.marklogic.contentpump.LocalJobRunner.java
License:Apache License
/**
 * Run the job. Get the input splits, create map tasks and submit them to
 * the thread pool if there is one; otherwise, run the tasks one by one.
 *
 * @param <INKEY>
 * @param <INVALUE>
 * @param <OUTKEY>
 * @param <OUTVALUE>
 * @throws Exception
 */
@SuppressWarnings("unchecked")
public <INKEY, INVALUE, OUTKEY, OUTVALUE, T extends org.apache.hadoop.mapreduce.InputSplit> void run()
        throws Exception {
    Configuration conf = job.getConfiguration();
    InputFormat<INKEY, INVALUE> inputFormat = (InputFormat<INKEY, INVALUE>) ReflectionUtils
            .newInstance(job.getInputFormatClass(), conf);
    List<InputSplit> splits = inputFormat.getSplits(job);
    T[] array = (T[]) splits.toArray(new org.apache.hadoop.mapreduce.InputSplit[splits.size()]);
    // sort the splits into order based on size, so that the biggest goes first
    Arrays.sort(array, new SplitLengthComparator());
    OutputFormat<OUTKEY, OUTVALUE> outputFormat = (OutputFormat<OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(job.getOutputFormatClass(), conf);
    Class<? extends Mapper<?, ?, ?, ?>> mapperClass = job.getMapperClass();
    Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(mapperClass, conf);
    try {
        outputFormat.checkOutputSpecs(job);
    } catch (Exception ex) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Error checking output specification: ", ex);
        } else {
            LOG.error("Error checking output specification: ");
            LOG.error(ex.getMessage());
        }
        return;
    }
    conf = job.getConfiguration();
    progress = new AtomicInteger[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        progress[i] = new AtomicInteger();
    }
    Monitor monitor = new Monitor();
    monitor.start();
    reporter = new ContentPumpReporter();
    List<Future<Object>> taskList = new ArrayList<Future<Object>>();
    for (int i = 0; i < array.length; i++) {
        InputSplit split = array[i];
        if (pool != null) {
            LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE> task =
                    new LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE>(
                            inputFormat, outputFormat, conf, i, split, reporter, progress[i]);
            availableThreads = assignThreads(i, array.length);
            Class<? extends Mapper<?, ?, ?, ?>> runtimeMapperClass = job.getMapperClass();
            if (availableThreads > 1 && availableThreads != threadsPerSplit) {
                // possible runtime adjustment
                if (runtimeMapperClass != (Class) MultithreadedMapper.class) {
                    runtimeMapperClass = (Class<? extends Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>>) cmd
                            .getRuntimeMapperClass(job, mapperClass, threadsPerSplit, availableThreads);
                }
                if (runtimeMapperClass != mapperClass) {
                    task.setMapperClass(runtimeMapperClass);
                }
                if (runtimeMapperClass == (Class) MultithreadedMapper.class) {
                    task.setThreadCount(availableThreads);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Thread Count for Split#" + i + " : " + availableThreads);
                    }
                }
            }
            if (runtimeMapperClass == (Class) MultithreadedMapper.class) {
                synchronized (pool) {
                    taskList.add(pool.submit(task));
                    pool.wait();
                }
            } else {
                pool.submit(task);
            }
        } else { // single-threaded
            JobID jid = new JobID();
            TaskID taskId = new TaskID(jid.getJtIdentifier(), jid.getId(), TaskType.MAP, i);
            TaskAttemptID taskAttemptId = new TaskAttemptID(taskId, 0);
            TaskAttemptContext context = ReflectionUtil.createTaskAttemptContext(conf, taskAttemptId);
            RecordReader<INKEY, INVALUE> reader = inputFormat.createRecordReader(split, context);
            RecordWriter<OUTKEY, OUTVALUE> writer = outputFormat.getRecordWriter(context);
            OutputCommitter committer = outputFormat.getOutputCommitter(context);
            TrackingRecordReader trackingReader = new TrackingRecordReader(reader, progress[i]);
            Mapper.Context mapperContext = ReflectionUtil.createMapperContext(mapper, conf, taskAttemptId,
                    trackingReader, writer, committer, reporter, split);
            trackingReader.initialize(split, mapperContext);
            // no thread pool (only 1 thread specified)
            Class<? extends Mapper<?, ?, ?, ?>> mapClass = job.getMapperClass();
            mapperContext.getConfiguration().setClass(CONF_MAPREDUCE_JOB_MAP_CLASS, mapClass, Mapper.class);
            mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils.newInstance(mapClass,
                    mapperContext.getConfiguration());
            mapper.run(mapperContext);
            trackingReader.close();
            writer.close(mapperContext);
            committer.commitTask(context);
        }
    }
    // wait till all tasks are done
    if (pool != null) {
        for (Future<Object> f : taskList) {
            f.get();
        }
        pool.shutdown();
        while (!pool.awaitTermination(1, TimeUnit.DAYS)) {
            // keep waiting for the pool to terminate
        }
        jobComplete.set(true);
    }
    monitor.interrupt();
    monitor.join(1000);
    // report counters
    Iterator<CounterGroup> groupIt = reporter.counters.iterator();
    while (groupIt.hasNext()) {
        CounterGroup group = groupIt.next();
        LOG.info(group.getDisplayName() + ": ");
        Iterator<Counter> counterIt = group.iterator();
        while (counterIt.hasNext()) {
            Counter counter = counterIt.next();
            LOG.info(counter.getDisplayName() + ": " + counter.getValue());
        }
    }
    LOG.info("Total execution time: " + (System.currentTimeMillis() - startTime) / 1000 + " sec");
}
From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapreduce.java
License:Apache License
/**
 * Runs mapper for the single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split split to run on
 */
@Override
@SuppressWarnings("unchecked")
public void runSplit(MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split, int splitIndex)
        throws IOException, ClassNotFoundException, InterruptedException {
    // Setup task ID info
    TaskAttemptID taskAttemptId = hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex);
    TaskAttemptContext taskContext = hadoopVersionSpecificCode.createTaskAttemptContext(configuration,
            taskAttemptId);

    InputFormat inputFormat = ReflectionUtils.newInstance(jobContext.getInputFormatClass(), configuration);

    // Create RecordReader
    org.apache.hadoop.mapreduce.RecordReader<INKEY, INVALUE> input = inputFormat
            .createRecordReader((InputSplit) split, taskContext);

    // Make a mapper
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    org.apache.hadoop.mapreduce.RecordWriter output;
    OutputCommitter committer = null;
    if (mapOnlyJob) {
        OutputFormat outputFormat = ReflectionUtils.newInstance(jobContext.getOutputFormatClass(),
                configuration);
        output = (org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(taskContext);
        committer = outputFormat.getOutputCommitter(taskContext);
        committer.setupTask(taskContext);
    } else {
        output = new MapOutputCollector<OUTKEY, OUTVALUE>(mapOutputAccumulator);
    }

    input.initialize((InputSplit) split, taskContext);

    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext =
            hadoopVersionSpecificCode.getMapperContext(configuration, taskAttemptId, input, output);
    mapper.run(mapperContext);

    input.close();
    output.close(mapperContext);

    if (mapOnlyJob && committer != null) {
        committer.commitTask(taskContext);
    }
}
From source file:com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapreduce.java
License:Apache License
public ReducerWrapperMapreduce(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    Configuration configuration = (Configuration) invocationParameters.getConfiguration();
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            configuration);
    JobID jobID = (JobID) invocationParameters.getJobId();

    // Setup task ID info
    TaskAttemptID id = hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition);
    JobContext jobContext = hadoopVersionSpecificCode.createJobContext(new JobConf(configuration), jobID);
    taskContext = hadoopVersionSpecificCode.createTaskAttemptContext(configuration, id);

    reducer = (org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobContext.getReducerClass(), configuration);

    OutputFormat outputFormat = ReflectionUtils.newInstance(jobContext.getOutputFormatClass(), configuration);

    recordWriter = (org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
            .getRecordWriter(taskContext);

    committer = outputFormat.getOutputCommitter(taskContext);
    committer.setupTask(taskContext);

    Class<INKEY> keyClass = (Class<INKEY>) jobContext.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobContext.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);

    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region,
            appId, HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, configuration) > 0,
            firstKeySerializer, valueSerializer, invocationParameters.getSerializationMode(),
            secondKeySerializer, keyClass, valueClass, sort,
            HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, configuration),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, configuration),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, configuration));
    DataGridChunkedCollectionReader<INKEY, INVALUE> transport = DataGridChunkedCollectionReader
            .getGridReader(params);

    context = hadoopVersionSpecificCode.getReducerContext(configuration, id, committer, recordWriter,
            transport, null);
}
From source file:com.yahoo.glimmer.indexing.generator.IndexRecordWriterTest.java
License:Open Source License
@Test
public void test() throws Exception {
    context.checking(new Expectations() {
        {
            allowing(taskContext).getConfiguration();
            will(returnValue(conf));
            allowing(taskContext).getTaskAttemptID();
            will(returnValue(taskAttemptID));
        }
    });
    OutputFormat outputFormat = new IndexRecordWriter.OutputFormat();

    conf.setStrings("RdfFieldNames", "index0", "index1");
    conf.setEnum("IndexType", RDFDocumentFactory.IndexType.VERTICAL);

    RecordWriter<IntWritable, IndexRecordWriterValue> recordWriter = outputFormat.getRecordWriter(taskContext);

    IntWritable key = new IntWritable();
    IndexRecordWriterTermValue termValue = new IndexRecordWriterTermValue();
    IndexRecordWriterDocValue docValue = new IndexRecordWriterDocValue();
    IndexRecordWriterSizeValue sizeValue = new IndexRecordWriterSizeValue();

    // ALIGNMENT_INDEX
    key.set(DocumentMapper.ALIGNMENT_INDEX);
    termValue.setTerm("term1");
    termValue.setTermFrequency(1);
    // The alignment index doesn't have positions/counts.
    termValue.setOccurrenceCount(0);
    termValue.setSumOfMaxTermPositions(0);
    recordWriter.write(key, termValue);
    docValue.setDocument(0); // term1 occurs in index 0
    recordWriter.write(key, docValue);

    // Index 0
    key.set(0);
    termValue.setTermFrequency(3);
    termValue.setOccurrenceCount(6);
    termValue.setSumOfMaxTermPositions(15 + 12 + 18);
    recordWriter.write(key, termValue);
    docValue.setDocument(3);
    docValue.clearOccerrences();
    docValue.addOccurrence(11);
    docValue.addOccurrence(15);
    recordWriter.write(key, docValue);
    docValue.setDocument(4);
    docValue.clearOccerrences();
    docValue.addOccurrence(12);
    recordWriter.write(key, docValue);
    docValue.setDocument(7);
    docValue.clearOccerrences();
    docValue.addOccurrence(14);
    docValue.addOccurrence(17);
    docValue.addOccurrence(18);
    recordWriter.write(key, docValue);

    // ALIGNMENT_INDEX
    key.set(DocumentMapper.ALIGNMENT_INDEX);
    termValue.setTerm("term2");
    termValue.setTermFrequency(2);
    // The alignment index doesn't have positions/counts.
    termValue.setOccurrenceCount(0);
    termValue.setSumOfMaxTermPositions(0);
    recordWriter.write(key, termValue);
    docValue.clearOccerrences();
    docValue.setDocument(0); // term2 occurs in index 0 & 1
    recordWriter.write(key, docValue);
    docValue.setDocument(1); // term2 occurs in index 0 & 1
    recordWriter.write(key, docValue);

    // Index 0
    key.set(0);
    termValue.setTermFrequency(2);
    termValue.setOccurrenceCount(4);
    termValue.setSumOfMaxTermPositions(19 + 16);
    recordWriter.write(key, termValue);
    docValue.setDocument(1);
    docValue.clearOccerrences();
    docValue.addOccurrence(10);
    docValue.addOccurrence(19);
    recordWriter.write(key, docValue);
    docValue.setDocument(7);
    docValue.clearOccerrences();
    docValue.addOccurrence(13);
    docValue.addOccurrence(16);
    recordWriter.write(key, docValue);

    // Index 1
    key.set(1);
    termValue.setTermFrequency(1);
    termValue.setOccurrenceCount(1);
    termValue.setSumOfMaxTermPositions(14);
    recordWriter.write(key, termValue);
    docValue.setDocument(1);
    docValue.clearOccerrences();
    docValue.addOccurrence(14);
    recordWriter.write(key, docValue);

    // ALIGNMENT_INDEX
    key.set(DocumentMapper.ALIGNMENT_INDEX);
    termValue.setTerm("term3");
    termValue.setTermFrequency(1);
    // The alignment index doesn't have positions/counts.
    termValue.setOccurrenceCount(0);
    termValue.setSumOfMaxTermPositions(0);
    recordWriter.write(key, termValue);
    docValue.setDocument(1); // term3 occurs in index 1
    recordWriter.write(key, docValue);
    docValue.clearOccerrences();

    // Index 1
    key.set(1);
    termValue.setTermFrequency(1);
    termValue.setOccurrenceCount(2);
    termValue.setSumOfMaxTermPositions(11);
    recordWriter.write(key, termValue);
    docValue.setDocument(3);
    docValue.clearOccerrences();
    docValue.addOccurrence(10);
    docValue.addOccurrence(11);
    recordWriter.write(key, docValue);

    // Doc Sizes.
    key.set(0);
    sizeValue.setDocument(0);
    sizeValue.setSize(3);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(3);
    sizeValue.setSize(1);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(4);
    sizeValue.setSize(10);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(6);
    sizeValue.setSize(2);
    recordWriter.write(key, sizeValue);

    key.set(1);
    sizeValue.setDocument(3);
    sizeValue.setSize(3);
    recordWriter.write(key, sizeValue);
    sizeValue.setDocument(6);
    sizeValue.setSize(5);
    recordWriter.write(key, sizeValue);

    recordWriter.close(taskContext);

    // Check the written indexes..
    Path workPath = outputFormat.getDefaultWorkFile(taskContext, "");
    System.out.println("Default work file is " + workPath.toString());
    String dir = workPath.toUri().getPath();

    BitStreamIndex index0 = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/index0", true, true);
    assertEquals(8, index0.numberOfDocuments);
    assertEquals(2, index0.numberOfTerms);
    assertTrue(index0.hasPositions);
    // term1
    checkOccurrences(index0.documents(0), 3, "(3:11,15) (4:12) (7:14,17,18)");
    // term2
    checkOccurrences(index0.documents(1), 2, "(1:10,19) (7:13,16)");
    assertEquals("[3, 0, 0, 1, 10, 0, 2, 0]", index0.sizes.toString());

    BitStreamIndex index1 = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/index1", true, true);
    assertEquals(8, index1.numberOfDocuments);
    assertEquals(2, index1.numberOfTerms);
    assertTrue(index1.hasPositions);
    checkOccurrences(index1.documents(0), 1, "(1:14)");
    // term3
    checkOccurrences(index1.documents(1), 1, "(3:10,11)");

    BitStreamIndex indexAlignment = (BitStreamIndex) DiskBasedIndex.getInstance(dir + "/alignment", true);
    assertEquals(8, indexAlignment.numberOfDocuments);
    assertEquals(3, indexAlignment.numberOfTerms);
    assertFalse(indexAlignment.hasPositions);
    // term1
    assertEquals(1, indexAlignment.documents(0).frequency());
    // term2
    assertEquals(2, indexAlignment.documents(1).frequency());
    // term3
    assertEquals(1, indexAlignment.documents(2).frequency());

    assertEquals("[0, 0, 0, 3, 0, 0, 5, 0]", index1.sizes.toString());
}