List of usage examples for org.apache.hadoop.io.Text
public Text(byte[] utf8)
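All of the examples below construct Text from a String rather than a byte[]; both constructors produce the same UTF-8 backed value. A minimal sketch of the two forms (the class name TextConstructorSketch is made up for illustration):

import org.apache.hadoop.io.Text;

import java.nio.charset.StandardCharsets;

public class TextConstructorSketch {
    public static void main(String[] args) {
        // Text(byte[] utf8) copies the given UTF-8 encoded bytes
        byte[] utf8 = "caf\u00e9".getBytes(StandardCharsets.UTF_8);
        Text fromBytes = new Text(utf8);

        // Text(String string) encodes the String as UTF-8 internally;
        // this is the form used throughout the examples below
        Text fromString = new Text("caf\u00e9");

        System.out.println(fromBytes.equals(fromString)); // true
    }
}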
From source file: co.nubetech.hiho.similarity.ngram.TestScoreReducer.java
License: Apache License
@Test
public void testReducerValidValues() throws IOException, InterruptedException {
    ArrayList<IntWritable> values = new ArrayList<IntWritable>();
    values.add(new IntWritable(1));
    values.add(new IntWritable(1));

    ValuePair key = new ValuePair();
    key.setValue1(new Text("This is a bookdelimiterBetweenKeyAndValuevalue1"));
    key.setValue2(new Text("This is not a bookdelimiterBetweenKeyAndValuevalue2"));

    Reducer.Context context = mock(Reducer.Context.class);
    ScoreReducer scoreReducer = new ScoreReducer();
    scoreReducer.reduce(key, values, context);
    verify(context).write(key, new LongWritable(2));
}
From source file: co.nubetech.hiho.similarity.ngram.TestSimilarityJob.java
License: Apache License
@Test
public void testSimilarityJobForValidValues() throws Exception {
    final String inputData = "This is a book value1\nThis is not a book value2";
    createTextFileInHDFS(inputData, "input", "testFile1.txt");
    String[] args = new String[] { "-inputPath", "input" };
    SimilarityJob.main(args);

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "outputOfScoreJob/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

    List<ValuePair> expectedOutputForKey = new ArrayList<ValuePair>();
    ValuePair valuePair = new ValuePair();
    valuePair.setValue1(new Text("This is a bookdelimiterBetweenKeyAndValuevalue1"));
    valuePair.setValue2(new Text("This is not a bookdelimiterBetweenKeyAndValuevalue2"));
    expectedOutputForKey.add(valuePair);

    List<LongWritable> expectedOutputForValue = new ArrayList<LongWritable>();
    expectedOutputForValue.add(new LongWritable(2));

    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("Key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableKey, expectedOutputForKey.contains(writableKey));
        assertTrue("Matched output " + writableValue, expectedOutputForValue.contains(writableValue));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(1, count);
}
From source file: com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(this.getConf());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);

    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));
    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.<String, String>emptyMap(), Collections.<String>emptySet());

    // take the cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair<>(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig bwConfig = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    // wait for the job and report failure through the exit code
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: com.accumulobook.advanced.mapreduce.WordCount.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair<>(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));
    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);
    BatchWriterConfig config = new BatchWriterConfig();
    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    // submit asynchronously rather than waiting for completion
    job.submit();
    return 0;
}
From source file: com.accumulobook.advanced.mapreduce.WordCountIngester.java
License: Apache License
public WordCountIngester(final Connector conn)
        throws TableNotFoundException, AccumuloException, AccumuloSecurityException, TableExistsException {
    this.conn = conn;
    if (!conn.tableOperations().exists(WikipediaConstants.WORD_COUNT_TABLE)) {
        conn.tableOperations().create(WikipediaConstants.WORD_COUNT_TABLE);

        // set up the combiner
        IteratorSetting iterSet = new IteratorSetting(10, "summingCombiner",
                org.apache.accumulo.core.iterators.user.SummingCombiner.class.getName());
        SummingCombiner.setEncodingType(iterSet, SummingCombiner.Type.STRING);
        List<IteratorSetting.Column> columns = new ArrayList<>();
        columns.add(new IteratorSetting.Column(new Text("counts"), new Text("")));
        SummingCombiner.setColumns(iterSet, columns);
        conn.tableOperations().attachIterator(WikipediaConstants.WORD_COUNT_TABLE, iterSet);
    }

    BatchWriterConfig config = new BatchWriterConfig();
    config.setMaxMemory(1000000L);
    config.setMaxWriteThreads(10);
    config.setMaxLatency(10, TimeUnit.SECONDS);
    batchWriter = conn.createBatchWriter(WikipediaConstants.WORD_COUNT_TABLE, config);
    closed = false;
}
From source file: com.accumulobook.basic.WikipediaClient.java
License: Apache License
/**
 * Scans one column from all rows of the articles table.
 *
 * @param columnFamily the column family to fetch
 * @param columnQualifier the column qualifier to fetch
 * @throws TableNotFoundException
 */
public void scanColumn(String columnFamily, String columnQualifier) throws TableNotFoundException {
    Scanner scanner = conn.createScanner(WikipediaConstants.ARTICLES_TABLE, auths);
    // scan one column from all rows
    scanner.fetchColumn(new Text(columnFamily), new Text(columnQualifier));
    for (Map.Entry<Key, Value> entry : scanner) {
        Key key = entry.getKey();
        String valueString = new String(entry.getValue().get());
        System.out.println(key.getRow().toString() + "\t" + valueString);
    }
}
From source file: com.accumulobook.designs.graph.Graph.java
License: Apache License
/**
 * Returns the neighbors of a given node, optionally restricted to one edge type.
 *
 * @param node the node whose neighbors to fetch
 * @param scanner a scanner over the graph table
 * @param edgeType the edge type to follow, or "ALL" for every edge type
 * @return the column qualifiers (neighbor IDs) of the matching edges
 */
public static Iterable<String> getNeighbors(final String node, final Scanner scanner, final String edgeType) {
    scanner.setRange(Range.exact(node));
    if (!edgeType.equals("ALL")) {
        scanner.fetchColumnFamily(new Text(edgeType));
    }
    return Iterables.transform(scanner, new Function<Entry<Key, Value>, String>() {
        @Override
        public String apply(Entry<Key, Value> f) {
            return f.getKey().getColumnQualifier().toString();
        }
    });
}
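A minimal usage sketch for getNeighbors, assuming a hypothetical table named "graph", a node "alice", and an existing Connector conn:

Scanner scanner = conn.createScanner("graph", new Authorizations());
for (String neighbor : Graph.getNeighbors("alice", scanner, "ALL")) {
    System.out.println(neighbor);
}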
From source file: com.accumulobook.designs.graph.Graph.java
License: Apache License
/**
 * Returns the neighbors of all the given nodes, optionally restricted to one edge type.
 *
 * @param neighbors the nodes whose neighbors to fetch
 * @param batchScanner a batch scanner over the graph table
 * @param edgeType the edge type to follow, or "ALL" for every edge type
 * @return the distinct neighbor IDs
 */
public static Iterable<String> neighborsOfNeighbors(final Iterable<String> neighbors,
        final BatchScanner batchScanner, final String edgeType) {
    List<Iterable<String>> nextNeighbors = new ArrayList<>();

    // process the given neighbors in batches of 100
    for (List<String> batch : Iterables.partition(neighbors, 100)) {
        batchScanner.setRanges(Lists.transform(batch, new Function<String, Range>() {
            @Override
            public Range apply(String f) {
                return Range.exact(f);
            }
        }));
        if (!edgeType.equals("ALL"))
            batchScanner.fetchColumnFamily(new Text(edgeType));
        // copy each batch's results eagerly: Iterables.transform is lazy,
        // and the batch scanner's ranges change on the next loop iteration
        nextNeighbors.add(Lists.newArrayList(
                Iterables.transform(batchScanner, new Function<Entry<Key, Value>, String>() {
                    @Override
                    public String apply(Entry<Key, Value> f) {
                        return f.getKey().getColumnQualifier().toString();
                    }
                })));
    }
    return Sets.newHashSet(Iterables.concat(nextNeighbors));
}
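A sketch of a two-hop traversal chaining the two helpers above, under the same hypothetical "graph" table and Connector conn; the thread count of 4 is arbitrary:

Scanner scanner = conn.createScanner("graph", new Authorizations());
BatchScanner batchScanner = conn.createBatchScanner("graph", new Authorizations(), 4);
Iterable<String> oneHop = Graph.getNeighbors("alice", scanner, "ALL");
Iterable<String> twoHop = Graph.neighborsOfNeighbors(oneHop, batchScanner, "ALL");
for (String node : twoHop) {
    System.out.println(node);
}
batchScanner.close();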
From source file: com.accumulobook.designs.multitermindex.WikipediaQueryMultiterm.java
License: Apache License
public void printResults(String[] terms) throws TableNotFoundException {
    BatchScanner scanner = conn.createBatchScanner(WikipediaConstants.DOC_PARTITIONED_TABLE, auths, 10);
    scanner.setTimeout(1, TimeUnit.MINUTES);
    scanner.setRanges(Collections.singleton(new Range()));

    Text[] termTexts = new Text[terms.length];
    for (int i = 0; i < terms.length; i++) {
        termTexts[i] = new Text(terms[i]);
    }

    // lookup all articles containing the terms
    IteratorSetting is = new IteratorSetting(50, IndexedDocIterator.class);
    IndexedDocIterator.setColfs(is, "ind", "doc");
    IndexedDocIterator.setColumnFamilies(is, termTexts);
    scanner.addScanIterator(is);

    for (Entry<Key, Value> entry : scanner) {
        String[] parts = entry.getKey().getColumnQualifier().toString().split("\0");
        System.out.println("doctype: " + parts[0] + "\ndocID:" + parts[1] + "\ninfo: " + parts[2]
                + "\n\ntext: " + entry.getValue().toString());
    }
}
From source file: com.accumulobook.designs.rdf.FreebaseExample.java
License: Apache License
public static Iterable<String> query(final String subject, final String predicate, final String object,
        final Scanner scanner) throws Exception {
    if (subject == null && predicate == null && object == null)
        throw new IllegalArgumentException("Must specify at least one of subject, predicate, object");

    if (subject != null) {
        if (predicate != null) {
            // SP_ get all objects for subject and predicate
            scanner.setRange(Range.prefix("spo_" + subject));
            scanner.fetchColumnFamily(new Text(predicate));
            return Iterables.transform(scanner, cqSelector);
        } else {
            if (object != null) {
                // S_O get all predicates for subject and object
                scanner.setRange(Range.prefix("osp_" + object));
                scanner.fetchColumnFamily(new Text(subject));
                return Iterables.transform(scanner, cqSelector);
            } else {
                // S__ get all predicates and objects for subject
                scanner.setRange(Range.prefix("spo_" + subject));
                return Iterables.transform(scanner, cfqSelector);
            }
        }
    } else {
        if (predicate != null) {
            if (object != null) {
                // _PO get all subjects for predicate and object
                scanner.setRange(Range.prefix("pos_" + predicate));
                scanner.fetchColumnFamily(new Text(object));
                return Iterables.transform(scanner, cqSelector);
            } else {
                // _P_ get all subjects and objects for predicate
                scanner.setRange(Range.prefix("pos_" + predicate));
                return Iterables.transform(scanner, cfqSelector);
            }
        } else {
            // __O get all subjects and predicates for object
            scanner.setRange(Range.prefix("osp_" + object));
            return Iterables.transform(scanner, cfqSelector);
        }
    }
}
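The query method above assumes each triple is stored three times in one table, under spo_, pos_, and osp_ row prefixes: spo_ rows are keyed by subject with the predicate as column family and the object as qualifier, pos_ rows by predicate with object/subject, and osp_ rows by object with subject/predicate. A minimal write-side sketch under that assumption (the table name "rdf" is hypothetical):

BatchWriter writer = conn.createBatchWriter("rdf", new BatchWriterConfig());
Value empty = new Value(new byte[0]);

// spo_: row = subject, family = predicate, qualifier = object
Mutation spo = new Mutation(new Text("spo_" + subject));
spo.put(new Text(predicate), new Text(object), empty);
writer.addMutation(spo);

// pos_: row = predicate, family = object, qualifier = subject
Mutation pos = new Mutation(new Text("pos_" + predicate));
pos.put(new Text(object), new Text(subject), empty);
writer.addMutation(pos);

// osp_: row = object, family = subject, qualifier = predicate
Mutation osp = new Mutation(new Text("osp_" + object));
osp.put(new Text(subject), new Text(predicate), empty);
writer.addMutation(osp);

writer.close();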