List of usage examples for org.apache.hadoop.io.Text Text()
public Text()
From source file: boa.datagen.SeqRepoImporter.java
License: Apache License

private static void getProcessedProjects() throws IOException {
    FileStatus[] files = fileSystem.listStatus(new Path(base + "tmprepcache/2015-08"));
    String hostname = InetAddress.getLocalHost().getHostName();
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String prefix = "projects-" + hostname + "-";
        String name = file.getPath().getName();
        int index1 = name.indexOf(prefix);
        if (index1 > -1) {
            try {
                SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
                final Text key = new Text();
                while (r.next(key)) {
                    processedProjectIds.add(key.toString());
                }
                r.close();
            } catch (EOFException e) {
                printError(e, "EOF Exception in " + file.getPath().getName());
                fileSystem.delete(file.getPath(), false);
            }
        }
    }
    System.out.println("Got processed projects: " + processedProjectIds.size());
}
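The key detail above is that a single Text instance is reused across every r.next(key) call: Text is a mutable buffer, so its contents must be copied out (here via toString()) before the next read overwrites them. For context, a minimal sketch of a matching write side, with a hypothetical path and ids, assuming the Hadoop 2 SequenceFile.Writer option API:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class WriteProjectIds {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/projects-example"); // hypothetical path
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(NullWritable.class));
        Text key = new Text(); // one mutable buffer, refilled per record
        for (String id : new String[] { "p1", "p2", "p3" }) { // hypothetical ids
            key.set(id);
            writer.append(key, NullWritable.get());
        }
        writer.close();
    }
}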
From source file: boostingPL.boosting.InstancesHelper.java
License: Open Source License

/**
 * Create an Instances header from metadata. The metadata looks like this:
 *
 * <br/>
 * <p>attributesNum:100</p>
 * <p>classes:+1,-1</p>
 * <br/>
 *
 * @param in the line reader positioned at the metadata
 * @return the Instances header
 * @throws IOException
 */
public static Instances createInstancesFromMetadata(LineReader in) throws IOException {
    int attributesNum = 0;
    ArrayList<Attribute> attInfo = null;
    List<String> classItems = null;

    Text line = new Text();
    while (in.readLine(line) > 0) {
        String sline = line.toString();
        if (sline.startsWith("attributesNum:")) {
            attributesNum = Integer.parseInt(sline.substring(14));
            attInfo = new ArrayList<Attribute>(attributesNum + 1);
            for (int i = 0; i < attributesNum; i++) {
                attInfo.add(new Attribute("attr" + i));
            }
            System.out.println("AttributeNum:" + attributesNum);
        } else if (sline.startsWith("classes:")) {
            String classes = sline.substring(8);
            String[] citems = classes.split(",");
            classItems = new ArrayList<String>(citems.length);
            for (String s : citems) {
                classItems.add(s);
            }
            System.out.println("classes:" + classes);
        }
    }

    attInfo.add(new Attribute("class", classItems));
    Instances insts = new Instances("BoostingPL-dataset", attInfo, 0);
    insts.setClassIndex(insts.numAttributes() - 1);
    return insts;
}
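A hypothetical caller, for illustration: Hadoop's LineReader refills the same Text buffer on every readLine() call, which is why the method converts each line to a String before parsing. The file path below is an assumption:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.LineReader;
import weka.core.Instances;

public class MetadataDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // hypothetical metadata file containing the two lines shown in the javadoc
        LineReader in = new LineReader(fs.open(new Path("/tmp/metadata.txt")), conf);
        try {
            Instances header = InstancesHelper.createInstancesFromMetadata(in);
            // attributesNum plain attributes plus the nominal class attribute
            System.out.println(header.numAttributes());
        } finally {
            in.close();
        }
    }
}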
From source file: brush.FastqRecordReader.java
License: Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // (start > 0)
        // use start > 0 to assume that files start with valid data

        // Advance to the start of the first record that ends with /1.
        // We use a temporary LineReader to read lines until we find the
        // position of the right one. We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                start += bytesRead;
            } else {
                // Line starts with @. Read two more lines and verify that
                // the second one starts with a +.
                //
                // If this isn't the start of a record, we want to backtrack to its end.
                long backtrackPosition = start + bytesRead;

                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // Backtrack to the end of the record we thought was the start.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        stream.seek(start);
    }

    pos = start;
}
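The backtracking relies on the FASTQ layout: each record is four lines (an '@' header, the bases, a '+' separator, and the quality string). Because '@' is also a legal quality-score character, a line starting with '@' is only a record start if the line two past it starts with '+'. A minimal standalone restatement of that test (method sketch, not part of the original class):

// l0 is the candidate header line, l2 the line two past it.
// '@' alone is ambiguous, since quality strings may also begin with '@'.
static boolean looksLikeRecordStart(String l0, String l2) {
    return !l0.isEmpty() && l0.charAt(0) == '@'
            && !l2.isEmpty() && l2.charAt(0) == '+';
}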
From source file: brush.FastqRecordReader.java
License: Apache License

/**
 * Seeks ahead in our split to the next key-value pair.
 *
 * Triggers the read of an interleaved FASTQ read pair, and populates
 * internal state.
 *
 * @return True if reading the next read pair succeeded.
 */
public boolean nextKeyValue() throws IOException, InterruptedException {
    currentValue = new Text();
    return next(currentValue);
}
From source file: brush.FastqRecordReader.java
License: Apache License

/**
 * Reads a line from the underlying line reader and appends it, plus a
 * trailing newline, to a text record.
 *
 * @param dest Text record to append the line to.
 * @param eofOk Whether an EOF is acceptable on this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Thrown if eofOk was false and we hit an EOF on
 * the current line.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH);

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk))
        throw new EOFException();

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    pos += bytesRead;

    return bytesRead;
}
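One detail worth calling out: Text.getBytes() returns the backing array, which can be longer than the valid contents, so the copy must be bounded by getLength(), exactly as above. A self-contained sketch of the same append pattern assembling a four-line record (hypothetical data):

import org.apache.hadoop.io.Text;

public class AppendDemo {
    public static void main(String[] args) {
        byte[] newline = { '\n' };
        Text dest = new Text();
        Text buf = new Text();
        for (String line : new String[] { "@read1/1", "GATTACA", "+", "IIIIIII" }) {
            buf.set(line);
            // bound the copy by getLength(), not getBytes().length
            dest.append(buf.getBytes(), 0, buf.getLength());
            dest.append(newline, 0, 1);
        }
        System.out.print(dest.toString()); // four newline-terminated lines
    }
}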
From source file: bucket_sort.NLineInputFormat.java
License: Apache License

public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split by one
                // character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}
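A small driver, with a hypothetical temp file, that makes the one-byte back-off concrete: for the content "a\nb\nc\nd\n" and numLinesPerSplit = 2, the splits come out as (start=0, length=3) and (start=3, length=4). Every split after the first starts one byte early, LineRecordReader discards the partial first line, and each mapper still sees exactly two lines. This assumes the method above returns the mapreduce-API FileSplit:

import java.io.File;
import java.io.FileWriter;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitDemo {
    public static void main(String[] args) throws Exception {
        File f = File.createTempFile("nline", ".txt"); // hypothetical input
        FileWriter w = new FileWriter(f);
        w.write("a\nb\nc\nd\n");
        w.close();
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        FileStatus status = fs.getFileStatus(new Path(f.getAbsolutePath()));
        // NLineInputFormat here is the bucket_sort class shown above
        List<FileSplit> splits = NLineInputFormat.getSplitsForFile(status, conf, 2);
        for (FileSplit s : splits) {
            System.out.println("start=" + s.getStart() + " length=" + s.getLength());
        }
        // prints: start=0 length=3, then start=3 length=4
    }
}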
From source file: ca.uwaterloo.iss4e.hadoop.io.CartesianRecordReader.java
License: Open Source License

public CartesianRecordReader(CompositeInputSplit split, TaskAttemptContext taskAttemptContext)
        throws IOException {
    this.leftIS = split.get(0);
    this.rightIS = split.get(1);
    this.rightTaskAttemptContext = taskAttemptContext;
    this.key = new Text();
    this.value = new Text();

    Configuration conf = rightTaskAttemptContext.getConfiguration();
    try {
        // Create left record reader
        FileInputFormat leftFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.LEFT_INPUT_FORMAT)), conf);
        leftRR = leftFIF.createRecordReader(leftIS, taskAttemptContext);

        // Create right record reader
        rightFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.RIGHT_INPUT_FORMAT)), conf);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        throw new IOException(e);
    } catch (InterruptedException e) {
        e.printStackTrace();
        throw new IOException(e);
    }
}
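For illustration, a hypothetical job setup that would satisfy this constructor: the left and right input format class names travel through the Configuration and are instantiated reflectively. The key constants belong to CartesianInputFormat; how that class is normally wired up is an assumption here:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CartesianSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "cartesian-demo");
        Configuration conf = job.getConfiguration();
        // class names stored in the Configuration; the reader above recovers
        // them with Class.forName + ReflectionUtils.newInstance
        conf.set(CartesianInputFormat.LEFT_INPUT_FORMAT, TextInputFormat.class.getName());
        conf.set(CartesianInputFormat.RIGHT_INPUT_FORMAT, TextInputFormat.class.getName());
        job.setInputFormatClass(CartesianInputFormat.class);
    }
}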
From source file: cascading.scheme.hadoop.TextDelimited.java
License: Open Source License

@Override
public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    sinkCall.setContext(new Object[3]);

    sinkCall.getContext()[0] = new Text();
    sinkCall.getContext()[1] = new StringBuilder(4 * 1024);
    sinkCall.getContext()[2] = Charset.forName(charsetName);

    if (writeHeader)
        writeHeader(sinkCall);
}
From source file: cascading.scheme.hadoop.TextLine.java
License: Open Source License

@Override
public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    sinkCall.setContext(new Object[2]);

    sinkCall.getContext()[0] = new Text();
    sinkCall.getContext()[1] = Charset.forName(charsetName);
}
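In both schemes above, the pattern is the same: allocate the reusable buffers once per sink in sinkPrepare() and stash them in the call context, so nothing is allocated per tuple. A hedged sketch, not Cascading's actual implementation, of how a sink() method might consume the TextLine context prepared above:

// Method sketch only; assumes the enclosing Scheme subclass.
@Override
public void sink(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    Text text = (Text) sinkCall.getContext()[0];
    Charset charset = (Charset) sinkCall.getContext()[1];
    // refill the same Text instance for every tuple
    String line = sinkCall.getOutgoingEntry().getTuple().toString();
    text.set(line.getBytes(charset));
    sinkCall.getOutput().collect(null, text);
}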
From source file: cascading.tap.hadoop.ZipInputFormatTest.java
License: Open Source License

public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");

    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);

    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);
        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);

        LongWritable key = new LongWritable();
        Text value = new Text();

        InputSplit[] splits = format.getSplits(job, 100);

        BitSet bits = new BitSet((int) length);
        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);
            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());

                    LOG.debug("read " + v);

                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));
                    bits.set(v);
                    count++;
                }

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}