List of usage examples for the org.apache.hadoop.io.Text constructor Text()
public Text()
From source file:authordetect.input.SingleBookReader.java
/** * @param inputSplit//from w w w. j av a 2s . com * @param context the information about the task * @throws java.io.IOException * @throws InterruptedException */ @Override public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit split = (FileSplit) inputSplit; Configuration configuration = context.getConfiguration(); // get the option from configuration: // 0 for group by author, 1 for group by book int option = configuration.getInt("GROUP_OPTION", 0); Path path = split.getPath(); filename = path.getName(); FileSystem fileSystem = path.getFileSystem(configuration); FSDataInputStream inputStream = fileSystem.open(path); lineReader = new LineReader(inputStream, configuration); //initial start point and end point start = split.getStart(); end = start + split.getLength(); inputStream.seek(start); if (start != 0) { start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start)); } start += lineReader.readLine(currentLine); prepareToScanBook(option); }
From source file:be.uantwerpen.adrem.disteclat.ItemReaderReducer.java
License:Apache License
/** * Writes the singletons distribution to file OSingletonsDistribution. The distribution is obtained using Round-Robin * allocation./*from w w w .ja va 2s. c o m*/ * * @param sortedSingletons * the sorted list of singletons * @throws IOException * @throws InterruptedException */ private void writeSingletonsDistribution(List<Integer> sortedSingletons) throws IOException, InterruptedException { int end = Math.min(numberOfMappers, sortedSingletons.size()); Text mapperId = new Text(); Text assignedItems = new Text(); // Round robin assignment for (int ix = 0; ix < end; ix++) { StringBuilder sb = new StringBuilder(); for (int ix1 = ix; ix1 < sortedSingletons.size(); ix1 += numberOfMappers) { sb.append(sortedSingletons.get(ix1)).append(" "); } mapperId.set("" + ix); assignedItems.set(sb.substring(0, sb.length() - 1)); mos.write(OSingletonsDistribution, mapperId, assignedItems); } }
From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java
License:Apache License
/** * Gets the different file splits for the data based on a given number of splits * /* www. j av a 2s . c om*/ * @param status * file status * @param conf * hadoop configuration object * @param numberOfSplits * number of splits to split the data in * @return list of file splits * @throws IOException * thrown if the file does not exist */ public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits) throws IOException { List<FileSplit> splits = newArrayList(); Path fileName = status.getPath(); if (status.isDir()) { throw new IOException("Not a file: " + fileName); } long totalNumberOfLines = getTotalNumberOfLines(conf, fileName); int numLinesPerSplit = (int) Math.ceil(1.0 * totalNumberOfLines / numberOfSplits); LineReader lr = null; FSDataInputStream in = null; try { in = fileName.getFileSystem(conf).open(fileName); lr = new LineReader(in, conf); Text line = new Text(); int numLines = 0; long begin = 0; long length = 0; int num = -1; while ((num = lr.readLine(line)) > 0) { numLines++; length += num; if (numLines == numLinesPerSplit) { splits.add(createFileSplit(fileName, begin, length)); begin += length; length = 0; numLines = 0; } } if (numLines != 0) { splits.add(createFileSplit(fileName, begin, length)); } } finally { if (lr != null) { lr.close(); } if (in != null) { in.close(); } } return splits; }
From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java
License:Apache License
/**
 * Gets the total number of lines from the file. If NUMBER_OF_LINES_KEY is set in the
 * configuration (to anything other than -1), that value is returned without reading the file.
 *
 * <p>Otherwise the file is opened and its lines are counted. The input stream is closed in all
 * cases, including when reading fails part-way through.
 *
 * @param conf
 *          hadoop configuration object
 * @param fileName
 *          name of file to count
 * @return the number of lines in the file, or 0 if the file could not be opened or read (the
 *         failure is printed to standard error)
 * @throws IOException
 *           declared for callers; I/O failures while counting are handled inside this method
 */
public static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    try {
        FSDataInputStream in = fileName.getFileSystem(conf).open(fileName);
        try {
            LineReader lr = new LineReader(in, conf);
            Text text = new Text();
            nrLines = 0;
            // readLine returns the number of bytes consumed; a non-positive value ends the count.
            while (lr.readLine(text) > 0) {
                nrLines++;
            }
            return nrLines;
        } finally {
            // Release the stream even when readLine throws; previously it leaked on failure.
            in.close();
        }
    } catch (IOException e) {
        // Best-effort behaviour kept from the original: report the problem and fall back to 0.
        e.printStackTrace();
    }
    return 0;
}
From source file:be.ugent.intec.halvade.tools.AlignerInstance.java
License:Open Source License
/**
 * Initialises the aligner from the job configuration reachable through the mapper context.
 *
 * <p>Reads the Halvade settings, allocates the reusable output objects and creates the scratch
 * directory if needed.
 *
 * @param context mapper context; also stored in the static {@code context} field
 * @param bin value stored in the {@code bin} field
 * @throws IOException declared for callers
 * @throws URISyntaxException declared for callers
 */
protected AlignerInstance(Mapper.Context context, String bin) throws IOException, URISyntaxException {
    AlignerInstance.context = context;
    header = null;

    // Scheduling-related settings.
    containers = HalvadeConf.getMapContainerCount(context.getConfiguration());
    tasksLeft = HalvadeConf.getMapTasksLeft(context.getConfiguration());
    redistribute = HalvadeConf.getRedistribute(context.getConfiguration());

    // Objects allocated once here and kept in fields.
    writableRecord = new SAMRecordWritable();
    writableRegion = new ChromosomeRegion();
    writeableCompactRegion = new GenomeSJ();
    stub = new Text();

    minChrLength = HalvadeConf.getMinChrLength(context.getConfiguration());
    chr = HalvadeConf.getChrList(context.getConfiguration());

    // Scratch directory: make sure the path ends with a slash and that the directory exists.
    tmpdir = HalvadeConf.getScratchTempDir(context.getConfiguration());
    if (!tmpdir.endsWith("/")) {
        tmpdir += "/";
    }
    new File(tmpdir).mkdirs();

    this.bin = bin;
    threads = HalvadeConf.getMapThreads(context.getConfiguration());
    isPaired = HalvadeConf.getIsPaired(context.getConfiguration());
    Logger.DEBUG("paired? " + isPaired);

    splitter = new ChromosomeSplitter(
            HalvadeConf.getBedRegions(context.getConfiguration()),
            context.getConfiguration());
    keepChrSplitPairs = HalvadeConf.getkeepChrSplitPairs(context.getConfiguration());
    keep = HalvadeConf.getKeepFiles(context.getConfiguration());
}
From source file:be.ugent.intec.halvade.tools.STARInstance.java
License:Open Source License
private void emitJSFile(String starOutDir, Mapper.Context context) throws InterruptedException { SAMSequenceDictionary dict = null;// www .jav a 2 s . c o m try { dict = HalvadeConf.getSequenceDictionary(context.getConfiguration()); } catch (IOException ex) { Logger.EXCEPTION(ex); throw new InterruptedException("Error getting the SAMSequenceDictionary for SJ processing"); } BufferedReader br = null; val = new Text(); sj = new GenomeSJ(); try { br = new BufferedReader(new FileReader(starOutDir + "/SJ.out.tab")); String line = br.readLine(); sj.parseSJString(line, dict); while (line != null) { val.set(line); context.write(sj, val); line = br.readLine(); } } catch (IOException | InterruptedException ex) { Logger.EXCEPTION(ex); } finally { if (br != null) { try { br.close(); } catch (IOException ex) { Logger.EXCEPTION(ex); } } } }
From source file:bigsatgps.BigDataHandler.java
License:Open Source License
/**
 * Wraps a single file into a Hadoop SequenceFile that holds one record: the key is the file
 * name and the value is the complete file content.
 *
 * <p>The output path is the input path up to its first {@code '.'} with {@code ".seq"}
 * appended; when the input path contains no {@code '.'}, {@code ".seq"} is appended to the
 * whole path.
 *
 * @param infile path of the file to convert
 * @return the path of the created sequence file, or {@code null} if an I/O error occurred (the
 *         error message is printed to standard error)
 * @throws Exception if a non-I/O failure occurs while setting up or writing
 */
public String ImageToSequence(String infile) throws Exception {
    String log4jConfPath = "lib/log4j.properties";
    PropertyConfigurator.configure(log4jConfPath);

    confHadoop = new Configuration();
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(confHadoop);

    Path inPath = new Path(infile);
    // Same cut point as before (first '.'), but a name without any '.' no longer crashes.
    int dot = infile.indexOf(".");
    String outfile = (dot >= 0 ? infile.substring(0, dot) : infile) + ".seq";
    Path outPath = new Path(outfile);

    FSDataInputStream in = null;
    SequenceFile.Writer writer = null;
    try {
        // Size the buffer from the file length: available() is only an estimate of the bytes
        // readable without blocking, so it may be smaller than the file.
        long length = fs.getFileStatus(inPath).getLen();
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File too large to fit in a single record: " + infile);
        }
        byte[] buffer = new byte[(int) length];

        in = fs.open(inPath);
        // readFully keeps reading until the buffer is full; a single read() may stop early.
        in.readFully(buffer);

        writer = SequenceFile.createWriter(fs, confHadoop, outPath, Text.class, BytesWritable.class);
        writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
    } catch (IOException e) {
        System.err.println("Exception MESSAGES = " + e.getMessage());
        return null;
    } finally {
        // Always release both streams; the input stream used to be left open.
        IOUtils.closeStream(writer);
        IOUtils.closeStream(in);
    }

    // Report success only after the record has been written.
    System.out.println();
    System.out.println("Successfully created the sequencefile " + outfile);
    return outfile;
}
From source file:bixo.examples.crawl.MultiDomainUrlFilter.java
License:Apache License
public MultiDomainUrlFilter(Path filterFile) throws Exception { //we could require a filter file and put these in all urls or leave them here _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$"); _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://"); JobConf conf = HadoopUtils.getDefaultJobConf(); try {//process the file passed in if (filterFile != null) { FileSystem fs = filterFile.getFileSystem(conf); if (fs.exists(filterFile)) { FSDataInputStream in = fs.open(filterFile); LineReader lr = new LineReader(in); Text tmpStr = new Text(); while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) {//skip blank lines String p = tmpStr.toString().trim();//remove whitespace if (p.substring(0, 1).equals("+")) {// '+' means do-crawl ArrayList filterPair = new ArrayList(); filterPair.add((Boolean) true); filterPair.add(Pattern.compile(p.substring(1, p.length()))); _filters.add(filterPair); } else if (p.substring(0, 1).equals("-")) {// '-' means filter out ArrayList filterPair = new ArrayList(); filterPair.add(new Boolean(false)); filterPair.add(Pattern.compile(p.substring(1, p.length()))); _filters.add(filterPair); } // otherwise a comment or malformed filter pattern }//ww w . j a v a 2 s.c om } } } catch (Exception e) { //any cleanup here? This would indicate a file system error, most likely throw e; } }
From source file:bixo.examples.crawl.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile) throws IOException, InterruptedException { //this reads regex filters from a file in HDFS or the native file system JobConf conf = HadoopUtils.getDefaultJobConf(); Path filterFile = new Path(urlFiltersFile); FileSystem fs = filterFile.getFileSystem(conf); List<String> filterList = new ArrayList<String>(); LOGGER.info("Looking for file: " + urlFiltersFile); if (fs.exists(filterFile)) { FSDataInputStream in = fs.open(filterFile); LineReader reader = new LineReader(in); Text tLine = new Text(); while (reader.readLine(tLine) > 0) { String line = tLine.toString(); if (StringUtils.isNotBlank(line) && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) { filterList.add(line.trim()); }//from w ww.j a v a 2 s. com } in.close(); } else { LOGGER.info("Can't find file: " + urlFiltersFile); } return filterList; }
From source file:boa.datagen.SeqProjectCombiner.java
License:Apache License
/**
 * Merges all {@code projects-*.seq} files found under {@code tmprepcache/2015-07} into a single
 * {@code repcache/2015-07/projects.seq} file, writing each project key from one file only.
 *
 * <p>Pass 1 records, for every project key, the name of the file that should supply it. Once a
 * key has been seen with at least one code repository whose first entry has revisions, the key
 * is marked and later occurrences no longer change its source. Pass 2 re-reads the files and
 * copies a record only from the file recorded as the source of its key.
 *
 * @param args unused
 * @throws IOException if the file system, a reader or the writer cannot be opened or closed
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "hdfs://boa-njt/");
    FileSystem fileSystem = FileSystem.get(conf);
    String base = conf.get("fs.default.name", "");

    // project key -> name of the file whose record will be copied
    HashMap<String, String> sourceFileByKey = new HashMap<String, String>();
    // project keys whose source is final
    HashSet<String> settledKeys = new HashSet<String>();

    FileStatus[] files = fileSystem.listStatus(new Path(base + "tmprepcache/2015-07"));

    // Pass 1: decide which file supplies each project.
    for (int index = 0; index < files.length; index++) {
        Path inputPath = files[index].getPath();
        String name = inputPath.getName();
        if (!name.startsWith("projects-") || !name.endsWith(".seq")) {
            continue;
        }
        System.out.println("Reading file " + index + " in " + files.length + ": " + name);

        SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, inputPath, conf);
        final Text key = new Text();
        final BytesWritable value = new BytesWritable();
        try {
            while (reader.next(key, value)) {
                String projectKey = key.toString();
                if (!settledKeys.contains(projectKey)) {
                    Project project = Project.parseFrom(
                            CodedInputStream.newInstance(value.getBytes(), 0, value.getLength()));
                    boolean hasRevisions = project.getCodeRepositoriesCount() > 0
                            && project.getCodeRepositories(0).getRevisionsCount() > 0;
                    if (hasRevisions) {
                        settledKeys.add(projectKey);
                    }
                    sourceFileByKey.put(projectKey, name);
                }
            }
        } catch (Exception e) {
            System.err.println(name);
            e.printStackTrace();
        }
        reader.close();
    }

    // Pass 2: copy each record from the file chosen for its key.
    SequenceFile.Writer writer = SequenceFile.createWriter(fileSystem, conf,
            new Path(base + "repcache/2015-07/projects.seq"), Text.class, BytesWritable.class);
    for (int index = 0; index < files.length; index++) {
        Path inputPath = files[index].getPath();
        String name = inputPath.getName();
        if (!name.startsWith("projects-") || !name.endsWith(".seq")) {
            continue;
        }
        System.out.println("Reading file " + index + " in " + files.length + ": " + name);

        SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, inputPath, conf);
        final Text key = new Text();
        final BytesWritable value = new BytesWritable();
        try {
            while (reader.next(key, value)) {
                String projectKey = key.toString();
                if (sourceFileByKey.get(projectKey).equals(name)) {
                    writer.append(key, value);
                }
            }
        } catch (Exception e) {
            System.err.println(name);
            e.printStackTrace();
        }
        reader.close();
    }

    writer.close();
    fileSystem.close();
}