List of usage examples for org.apache.hadoop.mapreduce.RecordReader.getCurrentKey()
public abstract KEYIN getCurrentKey() throws IOException, InterruptedException;
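Every example on this page follows the same basic pattern: obtain a RecordReader from an InputFormat, call initialize() with the split and a TaskAttemptContext, then alternate nextKeyValue() with getCurrentKey()/getCurrentValue() until the input is exhausted. The following minimal sketch shows that pattern outside a submitted job; it assumes a TextInputFormat over a path passed as the first program argument, and the class name GetCurrentKeyExample and the argument handling are illustrative only, not taken from any of the projects listed below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class GetCurrentKeyExample {
    public static void main(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path(args[0])); // hypothetical input path
        TextInputFormat inputFormat = new TextInputFormat();
        // Build a stand-alone task-attempt context so the reader can run outside a MapReduce job
        TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
        for (InputSplit split : inputFormat.getSplits(job)) {
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                // getCurrentKey() returns the key of the record positioned by the last nextKeyValue()
                LongWritable key = reader.getCurrentKey();
                Text value = reader.getCurrentValue();
                System.out.println(key.get() + "\t" + value);
            }
            reader.close();
        }
    }
}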
From source file:com.splout.db.hadoop.TupleSampler.java
License:Apache License
/**
 * Random sampling method a-la-TeraSort, getting some consecutive samples from each InputSplit
 * without using a Job.
 * The output is a SequenceFile with keys.
 *
 * @return The number of retrieved samples
 */
private long randomSampling(long sampleSize, Configuration hadoopConf, Path outFile, List<InputSplit> splits,
        Map<InputSplit, TableSpec> splitToTableSpec,
        Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat,
        Map<InputSplit, Map<String, String>> specificHadoopConf,
        Map<InputSplit, RecordProcessor> recordProcessorPerSplit,
        Map<InputSplit, JavascriptEngine> splitToJsEngine, int maxSplitsToVisit) throws IOException {

    // Instantiate the writer we will write samples to
    FileSystem fs = FileSystem.get(outFile.toUri(), hadoopConf);
    if (splits.size() == 0) {
        throw new IllegalArgumentException("There are no splits to sample from!");
    }
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, hadoopConf, outFile, Text.class,
            NullWritable.class);

    logger.info("Sequential sampling options, max splits to visit: " + maxSplitsToVisit + ", samples to take: "
            + sampleSize + ", total number of splits: " + splits.size());
    int blocks = Math.min(maxSplitsToVisit, splits.size());
    blocks = Math.min((int) sampleSize, blocks);
    long recordsPerSample = sampleSize / blocks;
    int sampleStep = splits.size() / blocks;

    long records = 0;

    CounterInterface counterInterface = new CounterInterface(null) {
        public Counter getCounter(String group, String name) {
            return Mockito.mock(Counter.class);
        }
    };

    // Take N samples from different parts of the input
    for (int i = 0; i < blocks; ++i) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);

        TaskAttemptContext attemptContext = null;
        try {
            attemptContext = TaskAttemptContextFactory.get(hadoopConf, attemptId);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        InputSplit split = splits.get(sampleStep * i);
        if (specificHadoopConf.get(split) != null) {
            for (Map.Entry<String, String> specificConf : specificHadoopConf.get(split).entrySet()) {
                attemptContext.getConfiguration().set(specificConf.getKey(), specificConf.getValue());
            }
        }
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = null;
        try {
            reader = splitToFormat.get(split).createRecordReader(split, attemptContext);
            reader.initialize(split, attemptContext);

            RecordProcessor processor = recordProcessorPerSplit.get(split);
            Text key = new Text();
            while (reader.nextKeyValue()) {
                ITuple tuple = reader.getCurrentKey();
                ITuple uTuple;
                try {
                    uTuple = processor.process(tuple, tuple.getSchema().getName(), counterInterface);
                } catch (Throwable e) {
                    throw new RuntimeException(e);
                }
                if (uTuple != null) { // user may have filtered the record
                    try {
                        key.set(TablespaceGenerator.getPartitionByKey(uTuple, splitToTableSpec.get(split),
                                splitToJsEngine.get(split)));
                    } catch (Throwable e) {
                        throw new RuntimeException("Error when determining partition key.", e);
                    }
                    writer.append(key, NullWritable.get());
                    records += 1;
                    if ((i + 1) * recordsPerSample <= records) {
                        break;
                    }
                }
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    writer.close();
    return records;
}
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
        throws IOException, InterruptedException {
    TextInputFormat textInputFormat = new TextInputFormat();
    InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), null);
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
            TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
    RecordReader<LongWritable, Text> recordReader = textInputFormat.createRecordReader(fileSplit,
            taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();
    List<Map.Entry> batch = new ArrayList<>();
    while (hasNext && batch.size() < batchSize) {
        batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
                String.valueOf(recordReader.getCurrentValue())));
        hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
    }
    return batch;
}
From source file:edu.uci.ics.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                writer.open();
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
                writer.close();
            } catch (Exception e) {
                throw new HyracksDataException(e);
            } finally {
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:edu.umn.cs.spatialHadoop.visualization.MultilevelPlot.java
License:Open Source License
private static void plotLocal(Path[] inFiles, final Path outPath, final Class<? extends Plotter> plotterClass,
        final OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    final boolean vflip = params.getBoolean("vflip", true);

    OperationsParams mbrParams = new OperationsParams(params);
    mbrParams.setBoolean("background", false);
    final Rectangle inputMBR = params.get("mbr") != null ? params.getShape("mbr").getMBR()
            : FileMBR.fileMBR(inFiles, mbrParams);
    OperationsParams.setShape(params, InputMBR, inputMBR);

    // Retrieve desired output image size and keep aspect ratio if needed
    int tileWidth = params.getInt("tilewidth", 256);
    int tileHeight = params.getInt("tileheight", 256);
    // Adjust width and height if aspect ratio is to be kept
    if (params.getBoolean("keepratio", true)) {
        // Expand input file to a rectangle for compatibility with the pyramid structure
        if (inputMBR.getWidth() > inputMBR.getHeight()) {
            inputMBR.y1 -= (inputMBR.getWidth() - inputMBR.getHeight()) / 2;
            inputMBR.y2 = inputMBR.y1 + inputMBR.getWidth();
        } else {
            inputMBR.x1 -= (inputMBR.getHeight() - inputMBR.getWidth()) / 2;
            inputMBR.x2 = inputMBR.x1 + inputMBR.getHeight();
        }
    }

    String outFName = outPath.getName();
    int extensionStart = outFName.lastIndexOf('.');
    final String extension = extensionStart == -1 ? ".png" : outFName.substring(extensionStart);

    // Start reading input file
    Vector<InputSplit> splits = new Vector<InputSplit>();
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!OperationsParams.isWildcard(inFile) && inFs.exists(inFile) && !inFs.isDirectory(inFile)) {
            if (SpatialSite.NonHiddenFileFilter.accept(inFile)) {
                // Use the normal input format splitter to add this non-hidden file
                Job job = Job.getInstance(params);
                SpatialInputFormat3.addInputPath(job, inFile);
                splits.addAll(inputFormat.getSplits(job));
            } else {
                // A hidden file, add it immediately as one split
                // This is useful if the input is a hidden file which is automatically
                // skipped by FileInputFormat. We need to plot a hidden file for the case
                // of plotting partition boundaries of a spatial index
                splits.add(new FileSplit(inFile, 0, inFs.getFileStatus(inFile).getLen(), new String[0]));
            }
        } else {
            Job job = Job.getInstance(params);
            SpatialInputFormat3.addInputPath(job, inFile);
            splits.addAll(inputFormat.getSplits(job));
        }
    }

    try {
        Plotter plotter = plotterClass.newInstance();
        plotter.configure(params);

        String[] strLevels = params.get("levels", "7").split("\\.\\.");
        int minLevel, maxLevel;
        if (strLevels.length == 1) {
            minLevel = 0;
            maxLevel = Integer.parseInt(strLevels[0]);
        } else {
            minLevel = Integer.parseInt(strLevels[0]);
            maxLevel = Integer.parseInt(strLevels[1]);
        }

        GridInfo bottomGrid = new GridInfo(inputMBR.x1, inputMBR.y1, inputMBR.x2, inputMBR.y2);
        bottomGrid.rows = bottomGrid.columns = 1 << maxLevel;

        TileIndex key = new TileIndex();
        // All canvases in the pyramid, one per tile
        Map<TileIndex, Canvas> canvases = new HashMap<TileIndex, Canvas>();

        for (InputSplit split : splits) {
            FileSplit fsplit = (FileSplit) split;
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(fsplit, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }

            while (reader.nextKeyValue()) {
                Rectangle partition = reader.getCurrentKey();
                if (!partition.isValid())
                    partition.set(inputMBR);
                Iterable<Shape> shapes = reader.getCurrentValue();

                for (Shape shape : shapes) {
                    Rectangle shapeMBR = shape.getMBR();
                    if (shapeMBR == null)
                        continue;
                    java.awt.Rectangle overlappingCells = bottomGrid.getOverlappingCells(shapeMBR);
                    // Iterate over levels from bottom up
                    for (key.level = maxLevel; key.level >= minLevel; key.level--) {
                        for (key.x = overlappingCells.x; key.x < overlappingCells.x
                                + overlappingCells.width; key.x++) {
                            for (key.y = overlappingCells.y; key.y < overlappingCells.y
                                    + overlappingCells.height; key.y++) {
                                Canvas canvas = canvases.get(key);
                                if (canvas == null) {
                                    Rectangle tileMBR = new Rectangle();
                                    int gridSize = 1 << key.level;
                                    tileMBR.x1 = (inputMBR.x1 * (gridSize - key.x) + inputMBR.x2 * key.x)
                                            / gridSize;
                                    tileMBR.x2 = (inputMBR.x1 * (gridSize - (key.x + 1))
                                            + inputMBR.x2 * (key.x + 1)) / gridSize;
                                    tileMBR.y1 = (inputMBR.y1 * (gridSize - key.y) + inputMBR.y2 * key.y)
                                            / gridSize;
                                    tileMBR.y2 = (inputMBR.y1 * (gridSize - (key.y + 1))
                                            + inputMBR.y2 * (key.y + 1)) / gridSize;
                                    canvas = plotter.createCanvas(tileWidth, tileHeight, tileMBR);
                                    canvases.put(key.clone(), canvas);
                                }
                                plotter.plot(canvas, shape);
                            }
                        }
                        // Update overlappingCells for the higher level
                        int updatedX1 = overlappingCells.x / 2;
                        int updatedY1 = overlappingCells.y / 2;
                        int updatedX2 = (overlappingCells.x + overlappingCells.width - 1) / 2;
                        int updatedY2 = (overlappingCells.y + overlappingCells.height - 1) / 2;
                        overlappingCells.x = updatedX1;
                        overlappingCells.y = updatedY1;
                        overlappingCells.width = updatedX2 - updatedX1 + 1;
                        overlappingCells.height = updatedY2 - updatedY1 + 1;
                    }
                }
            }
            reader.close();
        }

        // Done with all splits. Write output to disk
        LOG.info("Done with plotting. Now writing the output");
        final FileSystem outFS = outPath.getFileSystem(params);

        LOG.info("Writing default empty image");
        // Write a default empty image to be displayed for non-generated tiles
        BufferedImage emptyImg = new BufferedImage(tileWidth, tileHeight, BufferedImage.TYPE_INT_ARGB);
        Graphics2D g = new SimpleGraphics(emptyImg);
        g.setBackground(new Color(0, 0, 0, 0));
        g.clearRect(0, 0, tileWidth, tileHeight);
        g.dispose();

        // Write HTML file to browse the multilevel image
        OutputStream out = outFS.create(new Path(outPath, "default.png"));
        ImageIO.write(emptyImg, "png", out);
        out.close();

        // Add an HTML file that visualizes the result using Google Maps
        LOG.info("Writing the HTML viewer file");
        LineReader templateFileReader = new LineReader(
                MultilevelPlot.class.getResourceAsStream("/zoom_view.html"));
        PrintStream htmlOut = new PrintStream(outFS.create(new Path(outPath, "index.html")));
        Text line = new Text();
        while (templateFileReader.readLine(line) > 0) {
            String lineStr = line.toString();
            lineStr = lineStr.replace("#{TILE_WIDTH}", Integer.toString(tileWidth));
            lineStr = lineStr.replace("#{TILE_HEIGHT}", Integer.toString(tileHeight));
            lineStr = lineStr.replace("#{MAX_ZOOM}", Integer.toString(maxLevel));
            lineStr = lineStr.replace("#{MIN_ZOOM}", Integer.toString(minLevel));
            lineStr = lineStr.replace("#{TILE_URL}",
                    "'tile-' + zoom + '-' + coord.x + '-' + coord.y + '" + extension + "'");
            htmlOut.println(lineStr);
        }
        templateFileReader.close();
        htmlOut.close();

        // Write the tiles
        final Entry<TileIndex, Canvas>[] entries = canvases.entrySet().toArray(new Map.Entry[canvases.size()]);
        // Clear the hash map to save memory as it is no longer needed
        canvases.clear();
        int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());
        Parallel.forEach(entries.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i1, int i2) {
                boolean output = params.getBoolean("output", true);
                try {
                    Plotter plotter = plotterClass.newInstance();
                    plotter.configure(params);
                    for (int i = i1; i < i2; i++) {
                        Map.Entry<TileIndex, Canvas> entry = entries[i];
                        TileIndex key = entry.getKey();
                        if (vflip)
                            key.y = ((1 << key.level) - 1) - key.y;
                        Path imagePath = new Path(outPath, key.getImageFileName() + extension);
                        // Write this tile to an image
                        DataOutputStream outFile = output ? outFS.create(imagePath)
                                : new DataOutputStream(new NullOutputStream());
                        plotter.writeImage(entry.getValue(), outFile, vflip);
                        outFile.close();
                        // Remove entry to allow GC to collect it
                        entries[i] = null;
                    }
                    return null;
                } catch (InstantiationException e) {
                    e.printStackTrace();
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                return null;
            }
        }, parallelism);
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating rastierizer", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating rastierizer", e);
    }
}
From source file:edu.umn.cs.spatialHadoop.visualization.SingleLevelPlot.java
License:Open Source License
public static void plotLocal(Path[] inFiles, Path outFile, final Class<? extends Plotter> plotterClass,
        final OperationsParams params) throws IOException, InterruptedException {
    OperationsParams mbrParams = new OperationsParams(params);
    mbrParams.setBoolean("background", false);
    final Rectangle inputMBR = params.get(InputMBR) != null ? params.getShape("mbr").getMBR()
            : FileMBR.fileMBR(inFiles, mbrParams);
    if (params.get(InputMBR) == null)
        OperationsParams.setShape(params, InputMBR, inputMBR);

    // Retrieve desired output image size and keep aspect ratio if needed
    int width = params.getInt("width", 1000);
    int height = params.getInt("height", 1000);
    if (params.getBoolean("keepratio", true)) {
        // Adjust width and height to maintain aspect ratio and store the adjusted
        // values back in params in case the caller needs to retrieve them
        if (inputMBR.getWidth() / inputMBR.getHeight() > (double) width / height)
            params.setInt("height", height = (int) (inputMBR.getHeight() * width / inputMBR.getWidth()));
        else
            params.setInt("width", width = (int) (inputMBR.getWidth() * height / inputMBR.getHeight()));
    }
    // Store width and height in final variables to make them accessible in parallel
    final int fwidth = width, fheight = height;

    // Start reading input file
    List<InputSplit> splits = new ArrayList<InputSplit>();
    final SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    for (Path inFile : inFiles) {
        FileSystem inFs = inFile.getFileSystem(params);
        if (!OperationsParams.isWildcard(inFile) && inFs.exists(inFile) && !inFs.isDirectory(inFile)) {
            if (SpatialSite.NonHiddenFileFilter.accept(inFile)) {
                // Use the normal input format splitter to add this non-hidden file
                Job job = Job.getInstance(params);
                SpatialInputFormat3.addInputPath(job, inFile);
                splits.addAll(inputFormat.getSplits(job));
            } else {
                // A hidden file, add it immediately as one split
                // This is useful if the input is a hidden file which is automatically
                // skipped by FileInputFormat. We need to plot a hidden file for the case
                // of plotting partition boundaries of a spatial index
                splits.add(new FileSplit(inFile, 0, inFs.getFileStatus(inFile).getLen(), new String[0]));
            }
        } else {
            // Use the normal input format splitter to add this non-hidden file
            Job job = Job.getInstance(params);
            SpatialInputFormat3.addInputPath(job, inFile);
            splits.addAll(inputFormat.getSplits(job));
        }
    }

    // Copy splits to a final array to be used in parallel
    final FileSplit[] fsplits = splits.toArray(new FileSplit[splits.size()]);
    int parallelism = params.getInt("parallel", Runtime.getRuntime().availableProcessors());

    List<Canvas> partialCanvases = Parallel.forEach(fsplits.length, new RunnableRange<Canvas>() {
        @Override
        public Canvas run(int i1, int i2) {
            Plotter plotter;
            try {
                plotter = plotterClass.newInstance();
            } catch (InstantiationException e) {
                throw new RuntimeException("Error creating rastierizer", e);
            } catch (IllegalAccessException e) {
                throw new RuntimeException("Error creating rastierizer", e);
            }
            plotter.configure(params);
            // Create the partial layer that will contain the plot of the assigned partitions
            Canvas partialCanvas = plotter.createCanvas(fwidth, fheight, inputMBR);

            for (int i = i1; i < i2; i++) {
                try {
                    RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplits[i],
                            null);
                    if (reader instanceof SpatialRecordReader3) {
                        ((SpatialRecordReader3) reader).initialize(fsplits[i], params);
                    } else if (reader instanceof RTreeRecordReader3) {
                        ((RTreeRecordReader3) reader).initialize(fsplits[i], params);
                    } else if (reader instanceof HDFRecordReader) {
                        ((HDFRecordReader) reader).initialize(fsplits[i], params);
                    } else {
                        throw new RuntimeException("Unknown record reader");
                    }

                    while (reader.nextKeyValue()) {
                        Rectangle partition = reader.getCurrentKey();
                        if (!partition.isValid())
                            partition.set(inputMBR);
                        Iterable<Shape> shapes = reader.getCurrentValue();
                        // Run the plot step
                        plotter.plot(partialCanvas, plotter.isSmooth() ? plotter.smooth(shapes) : shapes);
                    }
                    reader.close();
                } catch (IOException e) {
                    throw new RuntimeException("Error reading the file ", e);
                } catch (InterruptedException e) {
                    throw new RuntimeException("Interrupt error ", e);
                }
            }
            return partialCanvas;
        }
    }, parallelism);

    boolean merge = params.getBoolean("merge", true);
    Plotter plotter;
    try {
        plotter = plotterClass.newInstance();
        plotter.configure(params);
    } catch (InstantiationException e) {
        throw new RuntimeException("Error creating plotter", e);
    } catch (IllegalAccessException e) {
        throw new RuntimeException("Error creating plotter", e);
    }

    // Whether we should vertically flip the final image or not
    boolean vflip = params.getBoolean("vflip", true);
    if (merge) {
        LOG.info("Merging " + partialCanvases.size() + " partial canvases");
        // Create the final canvas that will contain the final image
        Canvas finalCanvas = plotter.createCanvas(fwidth, fheight, inputMBR);
        for (Canvas partialCanvas : partialCanvases)
            plotter.merge(finalCanvas, partialCanvas);

        // Finally, write the resulting image to the given output path
        LOG.info("Writing final image");
        FileSystem outFs = outFile.getFileSystem(params);
        FSDataOutputStream outputFile = outFs.create(outFile);
        plotter.writeImage(finalCanvas, outputFile, vflip);
        outputFile.close();
    } else {
        // No merge
        LOG.info("Writing partial images");
        FileSystem outFs = outFile.getFileSystem(params);
        for (int i = 0; i < partialCanvases.size(); i++) {
            Path filename = new Path(outFile, String.format("part-%05d.png", i));
            FSDataOutputStream outputFile = outFs.create(filename);
            plotter.writeImage(partialCanvases.get(i), outputFile, vflip);
            outputFile.close();
        }
    }
}
From source file:eu.scape_project.tb.wc.archd.test.ARCTest.java
License:Apache License
/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);

    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();
        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: "
                + currType + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc
                + " Length: " + currLength);

        // check example record 1 (first one and the header of the ARC file)
        if (start == 1) {
            // "myContentString" is an arbitrary string snippet of which we know that it exists in the
            // content stream and of which we know the position in the stream.
            // We will search for the string in the content we read and compare it to the values we know.
            currContent = content2String(currStream);
            myContentString = "defaultgz_orderxml";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + "=> Index is: " + myContentStringIndex);

            assertEquals("ID not equal", "20130522085320/filedesc://3-2-20130522085320-00000-prepc2.arc",
                    currKey.toString());
            assertEquals("MIME Type not equal", "text/plain", currMIMEType);
            assertEquals("Response type not equal", "response", currType);
            assertEquals("URL not equal", "filedesc://3-2-20130522085320-00000-prepc2.arc", currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 08:53:20"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 1190, currLength);
            assertEquals("Content seems not to be correct", 531, myContentStringIndex);
        }
        start++;
    }
}
From source file:eu.scape_project.tb.wc.archd.test.WARCTest.java
License:Apache License
/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);

    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();
        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: "
                + currType + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc
                + " Length: " + currLength);

        // check example record 1 (first one and the header of the WARC file)
        if (start == 1) {
            // "myContentString" is an arbitrary string snippet of which we know that it exists in the
            // content stream and of which we know the position in the stream.
            // We will search for the string in the content we read and compare it to the values we know.
            currContent = content2String(currStream);
            myContentString = "isPartOf: basic";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + "=> Index is: " + myContentStringIndex);

            assertEquals("ID not equal", "<urn:uuid:18cfb53d-1c89-4cc6-863f-e5535d430c95>",
                    currKey.toString());
            assertEquals("MIME Type not equal", "application/warc-fields", currMIMEType);
            assertEquals("Response type not equal", "warcinfo", currType);
            assertEquals("URL not equal", null, currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 12:27:40"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 374, currLength);
            assertEquals("Content mismatch", 202, myContentStringIndex);
        }
        start++;
    }
}
From source file:gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormatTest.java
License:Apache License
@Test
public void testRecordReader() throws Exception {
    List<String> paths = Lists.newArrayList("/path1", "/path2");
    GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);
    GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
    RecordReader<LongWritable, Text> recordReader = inputFormat.createRecordReader(split,
            new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID("a", 1, TaskType.MAP, 1, 1)));

    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");
    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");
    Assert.assertFalse(recordReader.nextKeyValue());
}
From source file:info.halo9pan.word2vec.hadoop.mr.SortInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final SortInputFormat inFormat = new SortInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
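To make the defaults described in the Javadoc concrete, consider a hypothetical input of 40 splits: samples = min(10, 40) = 10, so sampleStep = 40 / 10 = 4 and every fourth split is opened, while recordsPerSample = 100,000 / 10 = 10,000, so each sampler thread stops after pulling 10,000 keys through getCurrentKey(). The collected keys are then sorted by the sampler and the partitions - 1 boundary keys it picks are written to partFile for the partitioner to read.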
From source file:it.crs4.pydoop.mapreduce.pipes.PipesMapper.java
License:Apache License
@Override
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    Configuration conf = context.getConfiguration();
    InputSplit split = context.getInputSplit();
    // FIXME: do we really need to be so convoluted?
    InputFormat<K1, V1> inputFormat;
    try {
        inputFormat = (InputFormat<K1, V1>) ReflectionUtils.newInstance(context.getInputFormatClass(), conf);
    } catch (ClassNotFoundException ce) {
        throw new RuntimeException("class not found", ce);
    }
    RecordReader<K1, V1> input = inputFormat.createRecordReader(split, context);
    input.initialize(split, context);
    boolean isJavaInput = Submitter.getIsJavaRecordReader(conf);
    try {
        // FIXME: what happens for a java mapper and no java record reader?
        DummyRecordReader fakeInput = (!isJavaInput && !Submitter.getIsJavaMapper(conf))
                ? (DummyRecordReader) input
                : null;
        application = new Application<K1, V1, K2, V2>(context, fakeInput);
    } catch (InterruptedException ie) {
        throw new RuntimeException("interrupted", ie);
    }
    DownwardProtocol<K1, V1> downlink = application.getDownlink();
    // FIXME: InputSplit is not Writable, but still, this is ugly...
    downlink.runMap((FileSplit) context.getInputSplit(), context.getNumReduceTasks(), isJavaInput);
    boolean skipping = conf.getBoolean(context.SKIP_RECORDS, false);
    boolean sent_input_types = false;
    try {
        if (isJavaInput) {
            // FIXME
            while (input.nextKeyValue()) {
                if (!sent_input_types) {
                    sent_input_types = true;
                    NullWritable n = NullWritable.get();
                    String kclass_name = n.getClass().getName();
                    String vclass_name = n.getClass().getName();
                    if (input.getCurrentKey() != null) {
                        kclass_name = input.getCurrentKey().getClass().getName();
                    }
                    if (input.getCurrentValue() != null) {
                        vclass_name = input.getCurrentValue().getClass().getName();
                    }
                    downlink.setInputTypes(kclass_name, vclass_name);
                }
                downlink.mapItem(input.getCurrentKey(), input.getCurrentValue());
                if (skipping) {
                    // flush the streams on every record input if running in skip mode
                    // so that we don't buffer other records surrounding a bad record
                    downlink.flush();
                }
            }
            downlink.endOfInput();
        }
        application.waitForFinish();
    } catch (Throwable t) {
        application.abort(t);
    } finally {
        cleanup(context);
    }
}