Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find usage examples for org.apache.hadoop.io Text getLength.

Prototype

@Override
public int getLength() 

Document

Returns the number of bytes in the byte array.
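
Note that this is the length in encoded UTF-8 bytes, not the number of Java chars. A minimal sketch of the difference (the class name is illustrative, not from any source file on this page):

import org.apache.hadoop.io.Text;

public class GetLengthDemo {
    public static void main(String[] args) {
        Text t = new Text("caf\u00E9");            // 'é' encodes to two UTF-8 bytes
        System.out.println(t.toString().length()); // 4 chars
        System.out.println(t.getLength());         // 5 bytes
    }
}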

Usage

From source file:com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline.java

License:Apache License

public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) {
    // Read back the commits to make sure
    Path archiveLogPath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath());
    try (SequenceFile.Reader reader = new SequenceFile.Reader(metaClient.getHadoopConf(),
            SequenceFile.Reader.file(archiveLogPath))) {
        Text key = new Text();
        Text val = new Text();
        while (reader.next(key, val)) {
            // TODO - limit the number of commits loaded in memory. this could get very large.
            // This is okay because only tooling will load the archived commit timeline today
            readCommits.put(key.toString(), Arrays.copyOf(val.getBytes(), val.getLength()));
        }
        this.setInstants(readCommits.keySet().stream()
                .map(s -> new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, s))
                .collect(Collectors.toList()));
    } catch (IOException e) {
        throw new HoodieIOException("Could not load archived commit timeline from path " + archiveLogPath, e);
    }
    // multiple casts will make this lambda serializable -
    // http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16
    this.details = (Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails;
    this.metaClient = metaClient;
}
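
The Arrays.copyOf(val.getBytes(), val.getLength()) idiom above matters because Text.getBytes() returns the backing array, which can be longer than the valid data once the instance has been reused. A minimal sketch of the pitfall (values and class name are illustrative):

import java.util.Arrays;

import org.apache.hadoop.io.Text;

public class TextCopyPitfall {
    public static void main(String[] args) {
        Text t = new Text("a longer earlier value");
        t.set("short");                                   // backing array is not shrunk
        byte[] raw = t.getBytes();                        // may still be the longer buffer
        byte[] valid = Arrays.copyOf(raw, t.getLength()); // only the 5 valid bytes
        System.out.println(raw.length + " vs " + valid.length);
    }
}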

From source file:com.yahoo.glimmer.indexing.generator.DocumentMapper.java

License:Open Source License

@Override
public void map(LongWritable key, Text record, Context context) throws IOException, InterruptedException {
    doc.setContent(record.getBytes(), record.getLength());

    if (doc.getSubject() == null) {
        // Parsing failed. (doc is already dereferenced above, so a null check here would be dead code.)
        context.getCounter(Counters.FAILED_PARSING).increment(1);
        LOG.error("Document failed parsing");
        return;
    }

    if (doc.getId() < 0) {
        throw new IllegalStateException("Negative docId:" + doc.getId() + " subject:" + doc.getSubject());
    }

    // This is used to write the position of the last occurrence and testing
    // if the fakeDocOccurrence for the term has already been written.
    Map<String, DocStat> termToDocStatMap = new HashMap<String, DocStat>();

    // Iterate over all indices
    for (int indexId = 0; indexId < fields.length; indexId++) {
        String fieldName = fields[indexId];
        if (fieldName.startsWith("NOINDEX")) {
            continue;
        }

        TermValue indexIdValue = new TermValue(Type.INDEX_ID, indexId);

        // Iterate in parallel over the words of the indices
        MutableString term = new MutableString("");
        MutableString nonWord = new MutableString("");
        WordReader termReader = doc.content(indexId);
        int position = 0;

        while (termReader.next(term, nonWord)) {
            // Read next property as well
            if (term != null && term.length() > 0) {
                String termString = term.toString();

                // Report progress
                context.setStatus(fields[indexId] + "=" + term.substring(0, Math.min(term.length(), 50)));

                // Create an occurrence at the next position
                TermValue occurrenceValue = new TermValue(Type.OCCURRENCE, doc.getId(), position);
                context.write(new TermKey(termString, indexId, occurrenceValue), occurrenceValue);

                DocStat docStat = termToDocStatMap.get(termString);
                if (docStat == null) {
                    if (doc.getIndexType() == RDFDocumentFactory.IndexType.VERTICAL) {
                        // For the Alignment Index, we write the predicate
                        // id(Which is equal to the index id for a VERTICAL
                        // index) the first time we encounter a term.
                        // The 'Alignment Index' is an index without counts
                        // or positions. It's used for query optimization in
                        // the query parser. The resulting 'alignment index'
                        // is basically used as a map from term to
                        // predicates that the term occurs in.
                        context.write(new TermKey(termString, ALIGNMENT_INDEX, indexIdValue), indexIdValue);
                    }
                    docStat = new DocStat();
                    docStat.last = position;
                    docStat.count = 1;
                    termToDocStatMap.put(termString, docStat);
                } else {
                    docStat.last = position;
                    docStat.count++;
                }

                position++;
                context.getCounter(Counters.INDEXED_OCCURRENCES).increment(1);
            } else {
                LOG.info("Nextterm is null");
            }
        }

        if (doc.getIndexType() == RDFDocumentFactory.IndexType.HORIZONTAL && position > 0) {
            TermValue docSizeValue = new TermValue(Type.DOC_SIZE, doc.getId(), position);
            context.write(new TermKey(TermKey.DOC_SIZE_TERM, indexId, docSizeValue), docSizeValue);
        }

        for (String termString : termToDocStatMap.keySet()) {
            DocStat docStat = termToDocStatMap.get(termString);
            TermValue occurrenceCountValue = new TermValue(Type.TERM_STATS, docStat.count, docStat.last);
            context.write(new TermKey(termString, indexId, occurrenceCountValue), occurrenceCountValue);
        }
        termToDocStatMap.clear();
    }

    context.getCounter(Counters.NUMBER_OF_RECORDS).increment(1);
}

From source file:com.yahoo.glimmer.indexing.preprocessor.ResourcesReducer.java

License:Open Source License

protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Object>.Context context)
        throws IOException, InterruptedException {
    context.getCounter(Counters.KEYS).increment(1);
    int keyPredicateCount = 0;
    int keyObjectCount = 0;
    int keyContextCount = 0;
    int relationsCount = 0;
    int duplicateRelations = 0;

    outputCount.output = OUTPUT.ALL;
    outputCount.count = 0;
    context.write(key, outputCount);

    bySubjectRecord.clearRelations();

    String lastValue = null;

    for (Text value : values) {
        context.getCounter(Counters.VALUES).increment(1);
        if (PREDICATE_TEXT.equals(value)) {
            keyPredicateCount++;
        } else if (OBJECT_TEXT.equals(value)) {
            keyObjectCount++;
        } else if (CONTEXT_TEXT.equals(value)) {
            keyContextCount++;
        } else if (SUBJECT_TEXT.equals(value)) {
            throw new IllegalArgumentException(
                    "Reducer got a SUBJECT value!?.  Should only be \"PREDICATE\", \"OBJECT\", \"CONTEXT\" or a relation String.");
        } else if (value.getLength() > 0) {
            String valueString = value.toString();
            if (!valueString.equals(lastValue)) {
                bySubjectRecord.addRelation(valueString);
                relationsCount++;
                lastValue = valueString;
            } else {
                duplicateRelations++;
            }
        }
    }

    if (relationsCount > 0) {
        if (duplicateRelations > 0) {
            context.getCounter(Counters.DUPLICATE_RELATIONS).increment(duplicateRelations);
        }

        // The docIds should match the OUTPUT.ALL hash values
        bySubjectRecord.setId(docId);
        bySubjectRecord.setSubject(key.toString());

        if (bySubjectRecord.getRelationsCount() != relationsCount) {
            System.out.println("Too many relations. Only indexing " + bySubjectRecord.getRelationsCount()
                    + " of " + relationsCount + ". Subject is:" + key.toString());
            context.getCounter(Counters.TOO_MANY_RELATIONS).increment(1);
        }
        context.write(key, bySubjectRecord);

        bySubjectRecord.setPreviousId(docId);
        context.getCounter(Counters.KEY_SUBJECT).increment(relationsCount);
    }

    if (keyPredicateCount > 0) {
        outputCount.output = OUTPUT.PREDICATE;
        outputCount.count = keyPredicateCount;
        context.write(key, outputCount);
        context.getCounter(Counters.KEY_PREDICATE).increment(keyPredicateCount);
    }
    if (keyObjectCount > 0) {
        outputCount.output = OUTPUT.OBJECT;
        outputCount.count = keyObjectCount;
        context.write(key, outputCount);
        context.getCounter(Counters.KEY_OBJECT).increment(keyObjectCount);
    }
    if (keyContextCount > 0) {
        outputCount.output = OUTPUT.CONTEXT;
        outputCount.count = keyContextCount;
        context.write(key, outputCount);
        context.getCounter(Counters.KEY_CONTEXT).increment(keyContextCount);
    }

    docId++;
}

From source file:core.client.impl.ConditionalWriterImpl.java

License:Apache License

private boolean isVisible(ByteSequence cv) {
    Text testVis = new Text(cv.toArray());
    if (testVis.getLength() == 0)
        return true;

    Boolean b = (Boolean) cache.get(testVis);
    if (b != null)
        return b;

    try {
        Boolean bb = ve.evaluate(new ColumnVisibility(testVis));
        cache.put(new Text(testVis), bb);
        return bb;
    } catch (VisibilityParseException e) {
        return false;
    } catch (BadArgumentException e) {
        return false;
    }
}

From source file:core.data.ConditionalMutation.java

License:Apache License

public ConditionalMutation(Text row) {
    this(row.getBytes(), 0, row.getLength());
}

From source file:cosmos.impl.IndexToMultimapRecord.java

License:Apache License

@Override
public MultimapRecord apply(Entry<Key, Value> input) {
    Key k = input.getKey();

    Text colqual = k.getColumnQualifier();

    int index = colqual.find(Defaults.NULL_BYTE_STR);
    if (-1 == index) {
        throw new RuntimeException("Was provided unexpected Key: " + k);
    }

    int start = index + 1;
    try {
        String docId = Text.decode(colqual.getBytes(), start, colqual.getLength() - start);

        return sorts.contents(id, docId);

    } catch (TableNotFoundException e) {
        throw new RuntimeException(e);
    } catch (UnexpectedStateException e) {
        throw new RuntimeException(e);
    } catch (CharacterCodingException e) {
        throw new RuntimeException(e);
    }
}
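
Because Text.find returns a byte offset, the Text.decode call above must pass getLength() minus the start offset. A standalone sketch of the same split (the null-byte separator and values are illustrative):

import java.nio.charset.CharacterCodingException;

import org.apache.hadoop.io.Text;

public class QualifierSplit {
    public static void main(String[] args) throws CharacterCodingException {
        Text colqual = new Text("term\u0000doc42");
        int index = colqual.find("\u0000");  // byte offset of the separator
        int start = index + 1;
        String docId = Text.decode(colqual.getBytes(), start, colqual.getLength() - start);
        System.out.println(docId);           // prints: doc42
    }
}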

From source file:cosmos.mapred.MediawikiMapper.java

License:Apache License

/**
 * Called once for each key/value pair in the input split. Most applications should override this, but the default is the identity function.
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    Object o;
    try {
        o = unmarshaller.unmarshal(new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
    } catch (JAXBException e) {
        throw new IOException("Couldn't unmarshall '" + value + "'", e);
    }

    PageType pageType = (PageType) o;

    Page page = pageTypeToPage(pageType);

    Value protobufValue = new Value(page.toByteArray());

    Mutation m = new Mutation(Long.toString(page.getId()));
    m.put(empty, empty, protobufValue);

    context.write(tableName, m);
}

From source file:crunch.MaxTemperature.java

License:Apache License

public static void main(String[] args) {
    Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00");

    ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength());
    int cp;
    while (buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) {
        System.out.println(Integer.toHexString(cp));
    }
}
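
For the string above, this loop prints 41, df, 6771 and 10400, one code point per line.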

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void text() {
    Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00");
    assertThat(t.getLength(), is(10));

    assertThat(t.find("\u0041"), is(0));
    assertThat(t.find("\u00DF"), is(1));
    assertThat(t.find("\u6771"), is(3));
    assertThat(t.find("\uD801\uDC00"), is(6));

    assertThat(t.charAt(0), is(0x0041));
    assertThat(t.charAt(1), is(0x00DF));
    assertThat(t.charAt(3), is(0x6771));
    assertThat(t.charAt(6), is(0x10400));
}
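
The assertions above follow directly from UTF-8 encoding widths: U+0041 takes 1 byte, U+00DF takes 2, U+6771 takes 3, and U+10400 (the surrogate pair \uD801\uDC00) takes 4, for 10 bytes in total, which also explains the byte offsets returned by find(). A minimal sketch confirming the count (class name is illustrative):

import java.nio.charset.StandardCharsets;

public class Utf8Widths {
    public static void main(String[] args) {
        // 1 + 2 + 3 + 4 UTF-8 bytes = 10
        String s = "\u0041\u00DF\u6771\uD801\uDC00";
        System.out.println(s.getBytes(StandardCharsets.UTF_8).length); // 10
    }
}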

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void test() throws IOException {
    Text t = new Text("hadoop");
    assertThat(t.getLength(), is(6));
    assertThat(t.getBytes().length, is(6));

    assertThat(t.charAt(2), is((int) 'd'));
    assertThat("Out of bounds", t.charAt(100), is(-1));
}