Example usage for org.apache.hadoop.io Text Text

Introduction

On this page you can find example usage of the org.apache.hadoop.io Text(byte[]) constructor.

Prototype

public Text(byte[] utf8) 

Document

Construct from a byte array.
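
Before the full source-file examples, here is a minimal, self-contained sketch of the constructor itself (not taken from any of the files below): it assumes the byte array already holds valid UTF-8, since Text stores its contents as UTF-8 bytes, and the class name TextFromBytesExample is only illustrative. It needs hadoop-common on the classpath.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.Text;

public class TextFromBytesExample {
    public static void main(String[] args) {
        // Bytes assumed to be valid UTF-8; Text keeps its contents as UTF-8 internally.
        byte[] utf8 = "hello hadoop".getBytes(StandardCharsets.UTF_8);

        // Construct a Text directly from the byte array.
        Text text = new Text(utf8);

        System.out.println(text.toString());  // "hello hadoop"
        System.out.println(text.getLength()); // length in bytes of the UTF-8 encoding
    }
}

Note that getLength() counts bytes of the UTF-8 encoding rather than characters, and getBytes() may return a backing array that is longer than getLength().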

Usage

From source file:co.nubetech.hiho.similarity.ngram.TestScoreReducer.java

License:Apache License

@Test
public void testReducerValidValues() throws IOException, InterruptedException {
    ArrayList<IntWritable> values = new ArrayList<IntWritable>();
    values.add(new IntWritable(1));
    values.add(new IntWritable(1));
    ValuePair key = new ValuePair();
    key.setValue1(new Text("This is a bookdelimiterBetweenKeyAndValuevalue1"));
    key.setValue2(new Text("This is not a bookdelimiterBetweenKeyAndValuevalue2"));
    Reducer.Context context = mock(Reducer.Context.class);
    ScoreReducer scoreReducer = new ScoreReducer();
    scoreReducer.reduce(key, values, context);
    verify(context).write(key, new LongWritable(2));
}

From source file:co.nubetech.hiho.similarity.ngram.TestSimilarityJob.java

License:Apache License

@Test
public void testSimilarityJobForValidValues() throws Exception {
    final String inputData = "This is a book   value1\nThis is not a book   value2";
    createTextFileInHDFS(inputData, "input", "testFile1.txt");
    String[] args = new String[] { "-inputPath", "input" };
    SimilarityJob.main(args);

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "outputOfScoreJob/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

    List<ValuePair> expectedOutputForKey = new ArrayList<ValuePair>();
    ValuePair valuePair = new ValuePair();
    valuePair.setValue1(new Text("This is a bookdelimiterBetweenKeyAndValuevalue1"));
    valuePair.setValue2(new Text("This is not a bookdelimiterBetweenKeyAndValuevalue2"));
    expectedOutputForKey.add(valuePair);

    List<LongWritable> expectedOutputForValue = new ArrayList<LongWritable>();
    expectedOutputForValue.add(new LongWritable(2));

    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("Key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableKey, expectedOutputForKey.contains(writableKey));
        assertTrue("Matched output " + writableValue, expectedOutputForValue.contains(writableValue));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(1, count);
}

From source file:com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(this.getConf());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);

    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));

    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.EMPTY_MAP, Collections.EMPTY_SET);

    // take cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair<>(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));

    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig bwConfig = new BatchWriterConfig();

    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.waitForCompletion(true);
    //job.submit();

    return 0;
}

From source file:com.accumulobook.advanced.mapreduce.WordCount.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(new Configuration());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair<>(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));

    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig config = new BatchWriterConfig();

    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.submit();
    return 0;
}

From source file:com.accumulobook.advanced.mapreduce.WordCountIngester.java

License:Apache License

public WordCountIngester(final Connector conn)
        throws TableNotFoundException, AccumuloException, AccumuloSecurityException, TableExistsException {
    this.conn = conn;

    if (!conn.tableOperations().exists(WikipediaConstants.WORD_COUNT_TABLE)) {
        conn.tableOperations().create(WikipediaConstants.WORD_COUNT_TABLE);

        // setup combiner
        IteratorSetting iterSet = new IteratorSetting(10, "summingCombiner",
                org.apache.accumulo.core.iterators.user.SummingCombiner.class.getName());

        SummingCombiner.setEncodingType(iterSet, SummingCombiner.Type.STRING);

        List<IteratorSetting.Column> columns = new ArrayList<>();
        columns.add(new IteratorSetting.Column(new Text("counts"), new Text("")));

        SummingCombiner.setColumns(iterSet, columns);
        conn.tableOperations().attachIterator(WikipediaConstants.WORD_COUNT_TABLE, iterSet);
    }

    BatchWriterConfig config = new BatchWriterConfig();
    config.setMaxMemory(1000000L);
    config.setMaxWriteThreads(10);
    config.setMaxLatency(10, TimeUnit.SECONDS);

    batchWriter = conn.createBatchWriter(WikipediaConstants.WORD_COUNT_TABLE, config);
    closed = false;
}

From source file:com.accumulobook.basic.WikipediaClient.java

License:Apache License

/**
 *
 * @param columnFamily
 * @param columnQualifier
 * @throws TableNotFoundException
 */
public void scanColumn(String columnFamily, String columnQualifier) throws TableNotFoundException {

    Scanner scanner = conn.createScanner(WikipediaConstants.ARTICLES_TABLE, auths);

    // scan one column from all rows
    scanner.fetchColumn(new Text(columnFamily), new Text(columnQualifier));

    for (Map.Entry<Key, Value> entry : scanner) {
        Key key = entry.getKey();

        String valueString = new String(entry.getValue().get());
        System.out.println(key.getRow().toString() + "\t" + valueString);
    }
}

From source file:com.accumulobook.designs.graph.Graph.java

License:Apache License

/**
 * return a list of neighbors for a given node
 *
 * @param node
 * @param scanner
 * @param edgeType
 * @return
 */
public static Iterable<String> getNeighbors(final String node, final Scanner scanner, final String edgeType) {

    scanner.setRange(Range.exact(node));

    if (!edgeType.equals("ALL")) {
        scanner.fetchColumnFamily(new Text(edgeType));
    }

    return Iterables.transform(scanner, new Function<Entry<Key, Value>, String>() {
        @Override
        public String apply(Entry<Key, Value> f) {
            return f.getKey().getColumnQualifier().toString();
        }
    });
}

From source file:com.accumulobook.designs.graph.Graph.java

License:Apache License

/**
 * return the set of neighbors of the given neighbors (the two-hop neighborhood)
 *
 * @param neighbors
 * @param batchScanner
 * @param edgeType
 * @return 
 */
public static Iterable<String> neighborsOfNeighbors(final Iterable<String> neighbors,
        final BatchScanner batchScanner, final String edgeType) {

    List<Iterable<String>> nextNeighbors = new ArrayList<>();

    // process given neighbors in batches of 100
    for (List<String> batch : Iterables.partition(neighbors, 100)) {
        batchScanner.setRanges(Lists.transform(batch, new Function<String, Range>() {
            @Override
            public Range apply(String f) {
                return Range.exact(f);
            }
        }));

        if (!edgeType.equals("ALL"))
            batchScanner.fetchColumnFamily(new Text(edgeType));

        nextNeighbors.add(Iterables.transform(batchScanner, new Function<Entry<Key, Value>, String>() {
            @Override
            public String apply(Entry<Key, Value> f) {
                return f.getKey().getColumnQualifier().toString();
            }
        }));
    }

    return Sets.newHashSet(Iterables.concat(nextNeighbors));
}

From source file:com.accumulobook.designs.multitermindex.WikipediaQueryMultiterm.java

License:Apache License

public void printResults(String[] terms) throws TableNotFoundException {

    BatchScanner scanner = conn.createBatchScanner(WikipediaConstants.DOC_PARTITIONED_TABLE, auths, 10);
    scanner.setTimeout(1, TimeUnit.MINUTES);
    scanner.setRanges(Collections.singleton(new Range()));

    Text[] termTexts = new Text[terms.length];
    for (int i = 0; i < terms.length; i++) {
        termTexts[i] = new Text(terms[i]);
    }

    // lookup all articles containing the terms
    IteratorSetting is = new IteratorSetting(50, IndexedDocIterator.class);
    IndexedDocIterator.setColfs(is, "ind", "doc");
    IndexedDocIterator.setColumnFamilies(is, termTexts);
    scanner.addScanIterator(is);

    for (Entry<Key, Value> entry : scanner) {
        String[] parts = entry.getKey().getColumnQualifier().toString().split("\0");
        System.out.println("doctype: " + parts[0] + "\ndocID:" + parts[1] + "\ninfo: " + parts[2] + "\n\ntext: "
                + entry.getValue().toString());
    }
}

From source file:com.accumulobook.designs.rdf.FreebaseExample.java

License:Apache License

public static Iterable<String> query(final String subject, final String predicate, final String object,
        final Scanner scanner) throws Exception {

    if (subject == null && predicate == null && object == null)
        throw new IllegalArgumentException("Must specify at least one of subject, predicate, object");

    if (subject != null) {
        if (predicate != null) { // SP_ get all objects for subject and predicate
            scanner.setRange(Range.prefix("spo_" + subject));
            scanner.fetchColumnFamily(new Text(predicate));

            return Iterables.transform(scanner, cqSelector);
        } else {
            if (object != null) { // S_O get all predicates for subject and object

                scanner.setRange(Range.prefix("osp_" + object));
                // in the osp_ index the column family holds the subject and the qualifier holds the predicate
                scanner.fetchColumnFamily(new Text(subject));

                return Iterables.transform(scanner, cqSelector);
            } else { // S__ get all predicates and objects for subject
                scanner.setRange(Range.prefix("spo_" + subject));
                return Iterables.transform(scanner, cfqSelector);
            }
        }
    } else {
        if (predicate != null) {
            if (object != null) { // _PO get all subjects for predicate and object
                scanner.setRange(Range.prefix("pos_" + predicate));
                scanner.fetchColumnFamily(new Text(object));

                return Iterables.transform(scanner, cqSelector);
            } else { // _P_ get all subjects and objects for predicate
                scanner.setRange(Range.prefix("pos_" + predicate));
                return Iterables.transform(scanner, cfqSelector);
            }
        } else { // __O get all subjects and predicates for object
            scanner.setRange(Range.prefix("osp_" + object));
            return Iterables.transform(scanner, cfqSelector);
        }
    }
}