List of usage examples for the org.apache.hadoop.util.LineReader constructor
public LineReader(InputStream in)
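Before the per-project examples below, here is a minimal sketch of the typical read loop with this constructor. It assumes a plain local FileInputStream as the byte source; the file name and class name are illustrative only and do not come from any of the projects listed. The constructor wraps any InputStream, and readLine(Text) returns the number of bytes consumed, or 0 at end of stream.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class LineReaderExample {
    public static void main(String[] args) throws IOException {
        // Any InputStream will do; a local file is used here purely for illustration.
        InputStream in = new FileInputStream("urls.txt");
        LineReader reader = new LineReader(in);
        Text line = new Text();
        // readLine(Text) returns the number of bytes consumed, or 0 at end of stream.
        while (reader.readLine(line) > 0) {
            System.out.println(line.toString());
        }
        // Closing the LineReader also closes the wrapped stream.
        reader.close();
    }
}

The same pattern recurs in every example below; the InputStream typically comes from FileSystem.open(Path) on HDFS rather than a local file.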
From source file:WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w ww. ja va 2s. co m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - index file: " + indexFile); LOG.info(" - language: " + language); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already. 
fs.delete(new Path(tmpPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName()); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } // Clean up. fs.delete(new Path(tmpPath), true); return 0; }
From source file:bixo.examples.crawl.MultiDomainUrlFilter.java
License:Apache License
public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try { //process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) { //skip blank lines
                    String p = tmpStr.toString().trim(); //remove whitespace
                    if (p.substring(0, 1).equals("+")) { // '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) { // '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(new Boolean(false));
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
            }
        }
    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}
From source file:bixo.examples.crawl.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}
From source file:boostingPL.MR.AdaBoostPLMapper.java
License:Open Source License
/** create instances header */
protected void setup(Context context) throws IOException, InterruptedException {
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();
}
From source file:boostingPL.MR.AdaBoostPLTestMapper.java
License:Open Source License
protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }

    instanceCounter = context.getCounter("BoostingPL", "Number of instances");
}
From source file:boostingPL.MR.AdaBoostPLTestReducer.java
License:Open Source License
protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }
}
From source file:brush.FastqRecordReader.java
License:Apache License
/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *             to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0) {
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
        }
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
From source file:brush.FastqRecordReader.java
License:Apache License
/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // (start > 0) // use start>0 to assume that files start with valid data
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one. We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                start += bytesRead;
            } else {
                // line starts with @. Read two more and verify that it starts with a +
                //
                // If this isn't the start of a record, we want to backtrack to its end
                long backtrackPosition = start + bytesRead;

                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // backtrack to the end of the record we thought was the start.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        stream.seek(start);
    }

    pos = start;
}
From source file:com.finderbots.miner.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}
From source file:com.redsqirl.workflow.server.connect.HDFSInterface.java
License:Open Source License
/**
 * Read rows from the path provided.
 *
 * @param path
 * @param delimiter
 * @param maxToRead
 * @throws RemoteException
 */
@Override
public List<String> select(String path, String delimiter, int maxToRead) throws RemoteException {
    Path p = new Path(path);
    List<String> ans = null;
    HdfsFileChecker fCh = new HdfsFileChecker(p);
    try {
        FileSystem fs = NameNodeVar.getFS();
        if (fCh.isDirectory()) {
            FileStatus[] fsA = fs.listStatus(p);
            int listSize = Math.min(maxToRead, fsA.length);
            ans = new ArrayList<String>(listSize);
            for (int i = 0; i < listSize; ++i) {
                ans.add(fsA[i].getPath().toString());
            }
        } else if (fCh.isFile()) {
            InputStream inS = fs.open(p);
            InputStream in = null;
            InputStream compressedReader = null;
            if (path.endsWith(".bz2") || path.endsWith(".bz")) {
                compressedReader = new BZip2CompressorInputStream(inS);
                in = compressedReader;
            } else if (path.endsWith(".gz")) {
                compressedReader = new GZIPInputStream(inS);
                in = compressedReader;
            } else {
                in = inS;
            }

            LineReader reader = new LineReader(in);
            ans = new ArrayList<String>(maxToRead);
            Text line = new Text();
            int lineNb = 0;
            while (reader.readLine(line) != 0 && lineNb < maxToRead) {
                ans.add(line.toString());
                ++lineNb;
            }
            if (compressedReader != null) {
                compressedReader.close();
            }
            inS.close();
        }
        // fs.close();
    } catch (IOException e) {
        logger.error("Cannot select the file or directory: " + p);
        logger.error(e.getMessage(), e);
    }
    // fCh.close();
    return ans;
}