Example usage for org.apache.hadoop.io.Text: the Text() constructor

Introduction

This page collects example usages of the no-argument constructor of org.apache.hadoop.io.Text, drawn from several open-source projects.

Prototype

public Text() 
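
A minimal sketch of the constructor in isolation: Text() creates an empty, mutable Writable that is typically allocated once and then refilled per record via set() and append(). This snippet is illustrative and not drawn from the projects listed below.

import org.apache.hadoop.io.Text;

public class TextDemo {
    public static void main(String[] args) {
        Text text = new Text(); // starts empty
        text.set("hello");
        byte[] suffix = " world".getBytes();
        text.append(suffix, 0, suffix.length);
        System.out.println(text); // prints "hello world"
    }
}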

Usage

From source file:boa.datagen.SeqRepoImporter.java

License:Apache License

private static void getProcessedProjects() throws IOException {
    FileStatus[] files = fileSystem.listStatus(new Path(base + "tmprepcache/2015-08"));
    String hostname = InetAddress.getLocalHost().getHostName();
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String prefix = "projects-" + hostname + "-";
        String name = file.getPath().getName();
        int index1 = name.indexOf(prefix);
        if (index1 > -1) {
            try {
                SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
                final Text key = new Text();
                while (r.next(key)) {
                    processedProjectIds.add(key.toString());
                }
                r.close();
            } catch (EOFException e) {
                printError(e, "EOF Exception in " + file.getPath().getName());
                fileSystem.delete(file.getPath(), false);
            }
        }
    }
    System.out.println("Got processed projects: " + processedProjectIds.size());
}
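
For context, a hedged sketch of how such a sequence file of project ids could have been written in the first place; the path and ids are made up for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class WriteProcessedProjects {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("tmprepcache/2015-08/projects-somehost-0"); // hypothetical path
        SequenceFile.Writer w = SequenceFile.createWriter(fs, conf, path, Text.class, NullWritable.class);
        Text key = new Text(); // one instance, reused for every record
        for (String id : new String[] { "1001", "1002" }) {
            key.set(id);
            w.append(key, NullWritable.get());
        }
        w.close();
    }
}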

From source file:boostingPL.boosting.InstancesHelper.java

License:Open Source License

/**
 * Creates an Instances header (attribute definitions plus the class
 * attribute) from metadata that looks like this:
 *
 *   <p>attributesNum:100</p>
 *   <p>classes:+1,-1</p>
 *
 * @param in a line reader positioned at the start of the metadata
 * @return an empty Instances object with attributes and class index set
 * @throws IOException
 */
public static Instances createInstancesFromMetadata(LineReader in) throws IOException {
    int attributesNum = 0;
    ArrayList<Attribute> attInfo = null;
    List<String> classItems = null;

    Text line = new Text();
    while (in.readLine(line) > 0) {
        String sline = line.toString();
        if (sline.startsWith("attributesNum:")) {
            attributesNum = Integer.parseInt(sline.substring(14));
            attInfo = new ArrayList<Attribute>(attributesNum + 1);
            for (int i = 0; i < attributesNum; i++) {
                attInfo.add(new Attribute("attr" + i));
            }

            System.out.println("AttributeNum:" + attributesNum);
        } else if (sline.startsWith("classes:")) {
            String classes = sline.substring(8);
            String[] citems = classes.split(",");
            classItems = new ArrayList<String>(citems.length);
            for (String s : citems) {
                classItems.add(s);
            }

            System.out.println("classes:" + classes);
        }
    }

    attInfo.add(new Attribute("class", classItems));
    Instances insts = new Instances("BoostingPL-dataset", attInfo, 0);
    insts.setClassIndex(insts.numAttributes() - 1);

    return insts;
}
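
A usage sketch, assuming the metadata sits in a small HDFS file; the path is hypothetical and LineReader is org.apache.hadoop.util.LineReader:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.LineReader;
import weka.core.Instances;

public class MetadataDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        FSDataInputStream in = fs.open(new Path("boostingPL/metadata.txt")); // hypothetical path
        LineReader reader = new LineReader(in, conf);
        Instances header = InstancesHelper.createInstancesFromMetadata(reader);
        reader.close();
        System.out.println(header.numAttributes() + " attributes, class index " + header.classIndex());
    }
}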

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // could be (start > 0) if files were assumed to start with valid data
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                start += bytesRead;
            } else {
                // line starts with @.  Read two more and verify that it starts with a +
                //
                // If this isn't the start of a record, we want to backtrack to its end
                long backtrackPosition = start + bytesRead;

                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // backtrack to the end of the record we thought was the start.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        stream.seek(start);
    }

    pos = start;
}
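
The checkBuffer helper is not shown in this snippet. Given the FASTQ layout (a record is four lines: @name, bases, +, qualities) and the surrounding comments, a plausible reconstruction is a test for the '@' record marker; interleaved readers may additionally check that the read name ends in /1:

// Hypothetical reconstruction of the unshown helper: a candidate record
// start is a non-empty line whose first byte is the FASTQ '@' marker.
private boolean checkBuffer(int bufferLength, Text buffer) {
    return bufferLength > 0 && buffer.getBytes()[0] == '@';
}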

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Seeks ahead in our split to the next key-value pair.
 *
 * Triggers the read of an interleaved FASTQ read pair, and populates
 * internal state.
 *
 * @return True if reading the next read pair succeeded.
 */
public boolean nextKeyValue() throws IOException, InterruptedException {
    currentValue = new Text();

    return next(currentValue);
}

From source file:brush.FastqRecordReader.java

License:Apache License

/**
 * Reads a line from the underlying line reader and appends it,
 * plus a trailing newline, into a text record.
 *
 * @param dest Text record to read line into.
 * @param eofOk Whether an EOF is acceptable in this line.
 * @return Returns the number of bytes read.
 *
 * @throws EOFException Throws if eofOk was false and we hit an EOF in
 *    the current line.
 */
private int appendLineInto(final Text dest, final boolean eofOk) throws EOFException, IOException {
    Text buf = new Text();
    int bytesRead = lineReader.readLine(buf, MAX_LINE_LENGTH);

    if (bytesRead < 0 || (bytesRead == 0 && !eofOk))
        throw new EOFException();

    dest.append(buf.getBytes(), 0, buf.getLength());
    dest.append(newline, 0, 1);
    pos += bytesRead;

    return bytesRead;
}
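
Text.append is what lets a single destination record accumulate the four lines of a FASTQ entry. A standalone illustration; the newline array stands in for the snippet's unshown newline field:

import org.apache.hadoop.io.Text;

public class AppendDemo {
    public static void main(String[] args) {
        byte[] newline = { '\n' }; // assumed to match the reader's newline field
        Text record = new Text();
        for (String line : new String[] { "@read1", "ACGT", "+", "IIII" }) {
            byte[] bytes = line.getBytes();
            record.append(bytes, 0, bytes.length);
            record.append(newline, 0, 1);
        }
        System.out.print(record); // one four-line FASTQ record
    }
}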

From source file:bucket_sort.NLineInputFormat.java

License:Apache License

public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split 
                // by one character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}
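
A hedged sketch of the getSplits driver that usually sits on top of this helper; the configuration key is borrowed from Hadoop's own NLineInputFormat and may differ in this project:

// Hypothetical caller, iterating the job's input files.
public static List<FileSplit> getSplitsForFiles(List<FileStatus> files, Configuration conf) throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    int numLinesPerSplit = conf.getInt("mapreduce.input.lineinputformat.linespermap", 1);
    for (FileStatus status : files) {
        splits.addAll(getSplitsForFile(status, conf, numLinesPerSplit));
    }
    return splits;
}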

From source file:ca.uwaterloo.iss4e.hadoop.io.CartesianRecordReader.java

License:Open Source License

public CartesianRecordReader(CompositeInputSplit split, TaskAttemptContext taskAttemptContext)
        throws IOException {

    this.leftIS = split.get(0);
    this.rightIS = split.get(1);
    this.rightTaskAttemptContext = taskAttemptContext;
    this.key = new Text();
    this.value = new Text();
    Configuration conf = rightTaskAttemptContext.getConfiguration();
    try {
        // Create left record reader
        FileInputFormat leftFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.LEFT_INPUT_FORMAT)), conf);

        leftRR = leftFIF.createRecordReader(leftIS, taskAttemptContext);

        // Create right record reader
        rightFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.RIGHT_INPUT_FORMAT)), conf);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        throw new IOException(e);
    } catch (InterruptedException e) {
        e.printStackTrace();
        throw new IOException(e);
    }
}
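
Since the reader resolves both input formats reflectively from the Configuration, the driving job has to set those class names up front. A minimal sketch, reusing the constants read in the constructor above (TextInputFormat here is the new-API org.apache.hadoop.mapreduce.lib.input.TextInputFormat):

// Hypothetical job setup.
Configuration conf = job.getConfiguration();
conf.set(CartesianInputFormat.LEFT_INPUT_FORMAT, TextInputFormat.class.getName());
conf.set(CartesianInputFormat.RIGHT_INPUT_FORMAT, TextInputFormat.class.getName());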

From source file:cascading.scheme.hadoop.TextDelimited.java

License:Open Source License

@Override
public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    sinkCall.setContext(new Object[3]);

    sinkCall.getContext()[0] = new Text();
    sinkCall.getContext()[1] = new StringBuilder(4 * 1024);
    sinkCall.getContext()[2] = Charset.forName(charsetName);

    if (writeHeader)
        writeHeader(sinkCall);
}

From source file:cascading.scheme.hadoop.TextLine.java

License:Open Source License

@Override
public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    sinkCall.setContext(new Object[2]);

    sinkCall.getContext()[0] = new Text();
    sinkCall.getContext()[1] = Charset.forName(charsetName);
}
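
The prepared context is consumed later when tuples are written out. A hedged sketch of the counterpart sink method; the real TextLine.sink differs in detail, and the tuple-to-line conversion here is simplified:

@Override
public void sink(FlowProcess<? extends Configuration> flowProcess,
        SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    Text text = (Text) sinkCall.getContext()[0]; // reused across tuples
    Charset charset = (Charset) sinkCall.getContext()[1];

    String line = sinkCall.getOutgoingEntry().getTuple().toString();
    text.set(line.getBytes(charset));
    sinkCall.getOutput().collect(null, text);
}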

From source file:cascading.tap.hadoop.ZipInputFormatTest.java

License:Open Source License

public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");

    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);

        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();
        InputSplit[] splits = format.getSplits(job, 100);

        BitSet bits = new BitSet((int) length);
        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);
            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());
                    LOG.debug("read " + v);

                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));
                    bits.set(v);
                    count++;
                }

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}