List of usage examples for org.apache.hadoop.util.LineReader#readLine
public int readLine(Text str) throws IOException
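All of the examples below share the same core pattern: readLine(Text) overwrites the supplied Text with the next line and returns the number of bytes consumed, including the line terminator, so a return value of 0 signals end of stream. A minimal self-contained sketch of that loop (assuming a local file path passed on the command line):

    import java.io.FileInputStream;
    import java.io.IOException;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.util.LineReader;

    public class ReadLineExample {
        public static void main(String[] args) throws IOException {
            LineReader reader = new LineReader(new FileInputStream(args[0]));
            Text line = new Text();
            // readLine() returns the bytes consumed; 0 means end of stream.
            while (reader.readLine(line) > 0) {
                System.out.println(line.toString());
            }
            reader.close();
        }
    }

Reusing one Text instance across iterations avoids an allocation per line, which is why every example here declares the Text once, outside the loop.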
From source file:WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override/*w ww. j a v a 2 s . c o m*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - index file: " + indexFile); LOG.info(" - language: " + language); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already. 
fs.delete(new Path(tmpPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName()); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } // Clean up. fs.delete(new Path(tmpPath), true); return 0; }
From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java
License:Apache License
/**
 * Gets the file splits for the data, based on a given number of splits.
 *
 * @param status file status
 * @param conf Hadoop configuration object
 * @param numberOfSplits number of splits to partition the data into
 * @return list of file splits
 * @throws IOException thrown if the file does not exist
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits)
        throws IOException {
    List<FileSplit> splits = newArrayList();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    long totalNumberOfLines = getTotalNumberOfLines(conf, fileName);
    int numLinesPerSplit = (int) Math.ceil(1.0 * totalNumberOfLines / numberOfSplits);
    LineReader lr = null;
    FSDataInputStream in = null;
    try {
        in = fileName.getFileSystem(conf).open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                splits.add(createFileSplit(fileName, begin, length));
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(createFileSplit(fileName, begin, length));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
        if (in != null) {
            in.close();
        }
    }
    return splits;
}
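Note the ceil rounding: a 10-line file requested as 3 splits gives numLinesPerSplit = ceil(10/3) = 4, producing splits of 4, 4, and 2 lines, so the trailing split absorbs the remainder and can be noticeably smaller than the rest.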
From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java
License:Apache License
/**
 * Gets the total number of lines in the file. If NUMBER_OF_LINES_KEY is set in the
 * configuration, that value is returned instead of scanning the file.
 *
 * @param conf Hadoop configuration object
 * @param fileName name of the file to count
 * @return the number of lines in the file
 * @throws IOException
 */
public static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    try {
        FSDataInputStream in = fileName.getFileSystem(conf).open(fileName);
        LineReader lr = new LineReader(in, conf);
        Text text = new Text();
        nrLines = 0;
        // readLine() returns the bytes read; 0 means end of stream.
        while (lr.readLine(text) > 0) {
            nrLines++;
        }
        in.close();
        return nrLines;
    } catch (IOException e) {
        e.printStackTrace();
    }
    return 0;
}
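Note the short-circuit at the top: a caller that already knows the line count can publish it in the configuration and skip the readLine() scan entirely. A minimal sketch (hypothetical caller code, assuming the NUMBER_OF_LINES_KEY constant is accessible and the path is made up):

    Configuration conf = new Configuration();
    // Pre-set the count so getTotalNumberOfLines() returns without opening the file.
    conf.setLong(SplitByKTextInputFormat.NUMBER_OF_LINES_KEY, 1000000L);
    long lines = SplitByKTextInputFormat.getTotalNumberOfLines(conf, new Path("/data/input.txt"));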
From source file:bixo.examples.crawl.MultiDomainUrlFilter.java
License:Apache License
public MultiDomainUrlFilter(Path filterFile) throws Exception {
    // We could require a filter file and put these in all urls, or leave them here.
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try { // process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) { // stop at the first blank line
                    String p = tmpStr.toString().trim(); // remove whitespace
                    if (p.substring(0, 1).equals("+")) { // '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) { // '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(new Boolean(false));
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    }
                    // otherwise a comment or malformed filter pattern
                }
            }
        }
    } catch (Exception e) {
        // Any cleanup here? This would indicate a file system error, most likely.
        throw e;
    }
}
From source file:bixo.examples.crawl.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    // This reads regex filters from a file in HDFS or the native file system.
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}
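A hedged sketch of a filter file this method would accept, assuming INCLUDE_CHAR and EXCLUDE_CHAR are "+" and "-" as in the MultiDomainUrlFilter example above (the patterns themselves are made up for illustration):

    +(?i)^https?://(www\.)?example\.com/.*
    -(?i)\.(pdf|zip|gz)$
    this line starts with neither character, so it is skipped

Blank lines are also skipped, since StringUtils.isNotBlank() rejects them.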
From source file:boostingPL.boosting.InstancesHelper.java
License:Open Source License
/**
 * Creates an Instances header from metadata. The metadata looks like this:
 *
 * <br/>
 * <p>attributesNum:100</p>
 * <p>classes:+1,-1</p>
 * <br/>
 *
 * @param in
 * @return
 * @throws IOException
 */
public static Instances createInstancesFromMetadata(LineReader in) throws IOException {
    int attributesNum = 0;
    ArrayList<Attribute> attInfo = null;
    List<String> classItems = null;

    Text line = new Text();
    while (in.readLine(line) > 0) {
        String sline = line.toString();
        if (sline.startsWith("attributesNum:")) {
            attributesNum = Integer.parseInt(sline.substring(14));
            attInfo = new ArrayList<Attribute>(attributesNum + 1);
            for (int i = 0; i < attributesNum; i++) {
                attInfo.add(new Attribute("attr" + i));
            }
            System.out.println("AttributeNum:" + attributesNum);
        } else if (sline.startsWith("classes:")) {
            String classes = sline.substring(8);
            String[] citems = classes.split(",");
            classItems = new ArrayList<String>(citems.length);
            for (String s : citems) {
                classItems.add(s);
            }
            System.out.println("classes:" + classes);
        }
    }

    attInfo.add(new Attribute("class", classItems));
    Instances insts = new Instances("BoostingPL-dataset", attInfo, 0);
    insts.setClassIndex(insts.numAttributes() - 1);
    return insts;
}
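A hedged usage sketch (hypothetical driver code, not from the source) that feeds the documented metadata format through a LineReader over an in-memory stream:

    String metadata = "attributesNum:3\nclasses:+1,-1\n";
    LineReader in = new LineReader(
            new ByteArrayInputStream(metadata.getBytes(StandardCharsets.UTF_8)));
    Instances header = InstancesHelper.createInstancesFromMetadata(in);
    // header now holds attr0..attr2 plus a nominal "class" attribute with
    // values {+1, -1}, and the class index set to the last attribute.
    in.close();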
From source file:bucket_sort.NLineInputFormat.java
License:Apache License
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split by one
                // character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}
From source file:com.finderbots.miner.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    // This reads regex filters from a file in HDFS or the native file system.
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}
From source file:com.knewton.mrtool.io.JsonRecordReaderTest.java
License:Apache License
/**
 * Tests the line reader in the record reader to see if records can be read correctly from the
 * beginning of an input stream.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testJsonRecordReader() throws IOException, InterruptedException {
    JsonRecordReader<Text> rr = new JsonRecordReader<Text>() {
        @Override
        protected Class<?> getDataClass(String jsonStr) {
            return Text.class;
        }
    };

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    FileSplit fileSplit = new FileSplit(new Path("recs.2013-03-20_02_52.log"), 0,
            recommendationBytes.length, new String[0]);

    new MockUp<FileSystem>() {
        @Mock
        public FSDataInputStream open(Path f) throws IOException {
            return new FSDataInputStream(new SeekableByteArrayInputStream(recommendationBytes));
        }
    };

    // Initialize it to get the compression codecs.
    rr.initialize(fileSplit, context);
    // Close the line reader and reopen it.
    rr.close();
    LineReader lineReader = rr.initLineReader(fileSplit, conf);
    Text line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[0], line.toString());
    line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[1], line.toString());
    lineReader.close();
}
From source file:com.knewton.mrtool.io.JsonRecordReaderTest.java
License:Apache License
/**
 * Tests the line reader in the record reader to see if records can be read correctly from a
 * random seek location in the input stream.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testJsonRecordReaderWithRandomPos() throws IOException, InterruptedException {
    JsonRecordReader<Text> rr = new JsonRecordReader<Text>() {
        @Override
        protected Class<?> getDataClass(String jsonStr) {
            return Text.class;
        }
    };

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    FileSplit fileSplit = new FileSplit(new Path("recs.2013-03-20_02_52.log"), 10,
            recommendationBytes.length, new String[0]);

    new MockUp<FileSystem>() {
        @Mock
        public FSDataInputStream open(Path f) throws IOException {
            return new FSDataInputStream(new SeekableByteArrayInputStream(recommendationBytes));
        }
    };

    // Initialize it to get the compression codecs.
    rr.initialize(fileSplit, context);
    // Close the line reader and reopen it.
    rr.close();
    LineReader lineReader = rr.initLineReader(fileSplit, conf);
    Text line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[1], line.toString());
    line = new Text();
    lineReader.readLine(line);
    assertTrue(line.toString().isEmpty());
    lineReader.close();
}