List of usage examples for org.apache.hadoop.util.LineReader#close()
public void close() throws IOException
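Before the examples, a minimal sketch of the pattern they all share may help: wrap an input stream in a LineReader, read lines with readLine(Text), and call close() when done (close() also closes the underlying stream). This is only a sketch; the file path and configuration are illustrative assumptions, not taken from any example below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class LineReaderCloseSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.txt"); // hypothetical input file
        LineReader reader = null;
        try {
            FSDataInputStream in = fs.open(path);
            reader = new LineReader(in, conf);
            Text line = new Text();
            // readLine returns the number of bytes consumed; 0 signals EOF.
            while (reader.readLine(line) > 0) {
                System.out.println(line);
            }
        } finally {
            // close() releases the buffer and closes the wrapped stream,
            // so the FSDataInputStream does not need a separate close.
            if (reader != null) {
                reader.close();
            }
        }
    }
}

Closing the reader in a finally block (or a try-with-resources wrapper) is what most of the examples below do in spirit; it prevents leaking the underlying stream if readLine throws.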
From source file:WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);
    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));

    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);
    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }
    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    // Clean up.
    fs.delete(new Path(tmpPath), true);

    return 0;
}
From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java
License:Apache License
/**
 * Gets the different file splits for the data based on a given number of splits.
 *
 * @param status
 *          file status
 * @param conf
 *          hadoop configuration object
 * @param numberOfSplits
 *          number of splits to split the data in
 * @return list of file splits
 * @throws IOException
 *           thrown if the file does not exist
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits)
        throws IOException {
    List<FileSplit> splits = newArrayList();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    long totalNumberOfLines = getTotalNumberOfLines(conf, fileName);
    int numLinesPerSplit = (int) Math.ceil(1.0 * totalNumberOfLines / numberOfSplits);
    LineReader lr = null;
    FSDataInputStream in = null;
    try {
        in = fileName.getFileSystem(conf).open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                splits.add(createFileSplit(fileName, begin, length));
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(createFileSplit(fileName, begin, length));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
        if (in != null) {
            in.close();
        }
    }
    return splits;
}
From source file:boostingPL.MR.AdaBoostPLMapper.java
License:Open Source License
/** Creates the instances header. */
protected void setup(Context context) throws IOException, InterruptedException {
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();
}
From source file:boostingPL.MR.AdaBoostPLTestMapper.java
License:Open Source License
protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }

    instanceCounter = context.getCounter("BoostingPL", "Number of instances");
}
From source file:boostingPL.MR.AdaBoostPLTestReducer.java
License:Open Source License
protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }
}
From source file:bucket_sort.NLineInputFormat.java
License:Apache License
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split by one
                // character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}
From source file:com.knewton.mrtool.io.JsonRecordReaderTest.java
License:Apache License
/**
 * Tests the line reader in the record reader to see if records can be read
 * correctly from the beginning of an input stream.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testJsonRecordReader() throws IOException, InterruptedException {
    JsonRecordReader<Text> rr = new JsonRecordReader<Text>() {
        @Override
        protected Class<?> getDataClass(String jsonStr) {
            return Text.class;
        }
    };

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    FileSplit fileSplit = new FileSplit(new Path("recs.2013-03-20_02_52.log"), 0, recommendationBytes.length,
            new String[0]);

    new MockUp<FileSystem>() {
        @Mock
        public FSDataInputStream open(Path f) throws IOException {
            return new FSDataInputStream(new SeekableByteArrayInputStream(recommendationBytes));
        }
    };

    // Initialize it to get the compression codecs.
    rr.initialize(fileSplit, context);
    // Close the line reader and reopen it.
    rr.close();
    LineReader lineReader = rr.initLineReader(fileSplit, conf);
    Text line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[0], line.toString());

    line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[1], line.toString());
    lineReader.close();
}
From source file:com.knewton.mrtool.io.JsonRecordReaderTest.java
License:Apache License
/**
 * Tests the line reader in the record reader to see if records can be read
 * correctly from a random seek location in the input stream.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testJsonRecordReaderWithRandomPos() throws IOException, InterruptedException {
    JsonRecordReader<Text> rr = new JsonRecordReader<Text>() {
        @Override
        protected Class<?> getDataClass(String jsonStr) {
            return Text.class;
        }
    };

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    FileSplit fileSplit = new FileSplit(new Path("recs.2013-03-20_02_52.log"), 10, recommendationBytes.length,
            new String[0]);

    new MockUp<FileSystem>() {
        @Mock
        public FSDataInputStream open(Path f) throws IOException {
            return new FSDataInputStream(new SeekableByteArrayInputStream(recommendationBytes));
        }
    };

    // Initialize it to get the compression codecs.
    rr.initialize(fileSplit, context);
    // Close the line reader and reopen it.
    rr.close();
    LineReader lineReader = rr.initLineReader(fileSplit, conf);
    Text line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[1], line.toString());

    line = new Text();
    lineReader.readLine(line);
    assertTrue(line.toString().isEmpty());
    lineReader.close();
}
From source file:com.ricemap.spateDB.operations.FileMBR.java
License:Apache License
/**
 * Computes the minimal bounding Prism (MBR) of a file by issuing a MapReduce
 * job that does the computation.
 * @param fs
 * @param file
 * @param stockShape
 * @param background
 * @return
 * @throws IOException
 */
public static <S extends Shape> Prism fileMBRMapReduce(FileSystem fs, Path file, S stockShape,
        boolean background) throws IOException {
    // Quickly get the file MBR if it is globally indexed.
    GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(fs, file);
    if (globalIndex != null) {
        // Return the MBR of the global index.
        // Compute file size by adding up sizes of all files assuming they are
        // not compressed.
        long totalLength = 0;
        for (Partition p : globalIndex) {
            Path filePath = new Path(file, p.filename);
            if (fs.exists(filePath))
                totalLength += fs.getFileStatus(filePath).getLen();
        }
        sizeOfLastProcessedFile = totalLength;
        return globalIndex.getMBR();
    }

    JobConf job = new JobConf(FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(file.toUri().getPath() + ".mbr_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("FileMBR");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Prism.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);

    job.setInputFormat(ShapeInputFormat.class);
    SpatialSite.setShapeClass(job, stockShape.getClass());
    job.setOutputFormat(TextOutputFormat.class);

    ShapeInputFormat.setInputPaths(job, file);
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputCommitter(MBROutputCommitter.class);

    // Submit the job
    if (background) {
        JobClient jc = new JobClient(job);
        lastSubmittedJob = jc.submitJob(job);
        return null;
    } else {
        lastSubmittedJob = JobClient.runJob(job);
        Counters counters = lastSubmittedJob.getCounters();
        Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
        FileMBR.sizeOfLastProcessedFile = inputBytesCounter.getValue();

        // Read the job result.
        FileStatus[] results = outFs.listStatus(outputPath);
        Prism mbr = new Prism();
        for (FileStatus fileStatus : results) {
            if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
                LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
                Text text = new Text();
                if (lineReader.readLine(text) > 0) {
                    mbr.fromText(text);
                }
                lineReader.close();
            }
        }

        outFs.delete(outputPath, true);
        return mbr;
    }
}
From source file:com.ricemap.spateDB.operations.RecordCount.java
License:Apache License
/**
 * Counts the exact number of lines in a file by issuing a MapReduce job
 * that does the counting.
 * @param fs
 * @param file
 * @return
 * @throws IOException
 */
public static long recordCountMapReduce(FileSystem fs, Path file) throws IOException {
    JobConf job = new JobConf(RecordCount.class);

    Path outputPath = new Path(file.toUri().getPath() + ".linecount");
    FileSystem outFs = outputPath.getFileSystem(job);
    outFs.delete(outputPath, true);

    job.setJobName("LineCount");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setCombinerClass(Reduce.class);
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(1);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, file);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    JobClient.runJob(job);

    // Read the job result.
    long lineCount = 0;
    FileStatus[] results = outFs.listStatus(outputPath);
    for (FileStatus fileStatus : results) {
        if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
            LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
            Text text = new Text();
            if (lineReader.readLine(text) > 0) {
                lineCount = Long.parseLong(text.toString());
            }
            lineReader.close();
        }
    }

    outFs.delete(outputPath, true);

    return lineCount;
}