List of usage examples for org.apache.hadoop.io.Text.toString()
@Override
public String toString()
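Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the source files below; the class name is made up) of what toString() does: Text holds a string as UTF-8 encoded bytes, and toString() decodes those bytes back into a java.lang.String.

import org.apache.hadoop.io.Text;

public class TextToStringExample {
    public static void main(String[] args) {
        Text text = new Text("caf\u00e9");       // Text stores the string as UTF-8 bytes
        String decoded = text.toString();        // decode the bytes back into a String
        System.out.println(decoded);             // prints: café
        System.out.println(text.getLength());    // prints: 5 (UTF-8 byte length, not character count)
    }
}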
From source file:com.hdfs.concat.crush.CrushReducer.java
License:Apache License
@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector,
        Reporter reporter) throws IOException {

    String bucket = bucketId.toString();

    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));

    int idx = findMatcher(dirName);

    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);

    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it to the task
     * attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    Exception rootCause = null;

    Object key = null;
    Object value = null;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());

            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            try {
                if (null == key) {
                    key = reader.createKey();
                    value = reader.createValue();

                    /*
                     * Set the key and value class in the conf, which the output format uses to get type
                     * information.
                     */
                    job.setOutputKeyClass(key.getClass());
                    job.setOutputValueClass(value.getClass());

                    /*
                     * Output file name is absolute so we can just add it to the crush prefix.
                     */
                    sink = createRecordWriter(idx, "crush" + outputFileName);
                } else {
                    Class<?> other = reader.createKey().getClass();

                    if (!(key.getClass().equals(other))) {
                        throw new IllegalArgumentException(format("Heterogeneous keys detected in %s: %s !- %s",
                                inputPath, key.getClass(), other));
                    }

                    other = reader.createValue().getClass();

                    if (!value.getClass().equals(other)) {
                        throw new IllegalArgumentException(format("Heterogeneous values detected in %s: %s !- %s",
                                inputPath, value.getClass(), other));
                    }
                }

                while (reader.next(key, value)) {
                    sink.write(key, value);
                    reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);
                }
            } catch (Exception e) {
                rootCause = e;
            } finally {
                try {
                    reader.close();
                } catch (Exception e) {
                    if (null == rootCause) {
                        rootCause = e;
                    } else {
                        LOG.debug("Swallowing exception on close of " + inputPath, e);
                    }
                }
            }

            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task
             * attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;

            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;

                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }

            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }

            throw new RuntimeException(rootCause);
        }
    }
}
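The first thing this reducer does with the Text key is convert it to a String and split it into a directory name and a bucket number. As a stand-alone illustration of that parsing step (the path below is made up purely for clarity, it is not from the project):

// Hypothetical values, chosen only to illustrate the substring logic above.
Text bucketId = new Text("/user/example/in/2-1");                  // "<source dir>-<bucket number>"
String bucket = bucketId.toString();
String dirName = bucket.substring(0, bucket.lastIndexOf('-'));     // "/user/example/in/2"
String bucketNum = bucket.substring(bucket.lastIndexOf('-') + 1);  // "1"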
From source file:com.hdfs.concat.crush.CrushStandAloneSequenceFileTest.java
License:Apache License
private void verifyFile(File dir, String fileName, int key, int count) throws IOException {
    File file = new File(dir, fileName);

    Reader reader = new Reader(FileSystem.get(job), new Path(file.getAbsolutePath()), job);

    int i = 0;
    int actual = 0;

    Text text = new Text();
    IntWritable value = new IntWritable();

    while (reader.next(text, value)) {
        assertThat(text.toString(), equalTo(Integer.toString(key)));
        assertThat(value.get(), equalTo(i));

        if (i == 9) {
            i = 0;
        } else {
            i++;
        }

        actual++;
    }

    reader.close();

    assertThat(actual, equalTo(count));
}
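One reason toString() is called on every iteration in loops like this: reader.next(text, value) deserializes into the same Text and IntWritable objects each time, so anything you want to keep after the loop has to be copied out, for example with toString(). A minimal sketch of that pattern, assuming a FileSystem fs, a Configuration conf, and a hypothetical sequence file path (none of these are from the test above):

// Minimal sketch: collect String copies because the reader refills the same `key` object.
List<String> keys = new ArrayList<String>();
Text key = new Text();
IntWritable value = new IntWritable();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("/tmp/example.seq"), conf); // hypothetical path
while (reader.next(key, value)) {
    keys.add(key.toString()); // copy out; `key` is overwritten on the next call to next()
}
reader.close();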
From source file:com.hdfs.concat.crush.CrushTest.java
License:Apache License
@Test
public void bucketing() throws Exception {
    File in = tmp.newFolder("in");

    Counters expectedCounters = new Counters();
    List<String> expectedBucketFiles = new ArrayList<String>();

    /*
     * Create a hierarchy of directories. Directories are distinguished by a trailing slash in these comments.
     *
     * 1/
     *   1.1/
     *     file1 10 bytes
     *     file2 20 bytes
     *     file3 30 bytes
     *     file4 41 bytes
     *     file5 15 bytes
     *     file6 30 bytes
     *     file7 20 bytes
     *   1.2/
     *     file1 20 bytes
     *     file2 10 bytes
     *   1.3/
     *
     * 2/
     *   file1 70 bytes
     *   file2 30 bytes
     *   file3 25 bytes
     *   file4 30 bytes
     *   file5 35 bytes
     *   2.1/
     *     file1 10 bytes
     *   2.2/
     *     file1 25 bytes
     *     file2 15 bytes
     *     file3 35 bytes
     *   2.3/
     *     file1 41 bytes
     *     file2 10 bytes
     *   2.4/
     *     2.4.1/
     *       file1 100 bytes
     *       file2 30 bytes
     *     2.4.2/
     *       file1 20 bytes
     *       file2 20 bytes
     *       file3 10 bytes
     */

    /*
     * in contains 2 dirs and no files so it is skipped.
     *
     * in/
     *   1/
     *   2/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    tmp.newFolder("in/1");
    File dir2 = tmp.newFolder("in/2");

    /*
     * in/1 contains three dirs and no files so it is skipped.
     *
     * in/
     *   1/
     *     1.1/
     *     1.2/
     *     1.3/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    File dir1_1 = tmp.newFolder("in/1/1.1");
    File dir1_2 = tmp.newFolder("in/1/1.2");
    tmp.newFolder("in/1/1.3");

    /*
     * in/2 contains five files and four dirs.
     *
     * in/
     *   2/
     *     file1 70 bytes
     *     file2 30 bytes
     *     file3 25 bytes
     *     file4 30 bytes
     *     file5 35 bytes
     *     2.1/
     *     2.2/
     *     2.3/
     *     2.4/
     *
     *   0          1          2
     *   file5 35   file2 30   file4 30
     *              file3 25
     *
     * Buckets 0 and 2 have a single file each so they are ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 5);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 3);

    File dir2_1 = tmp.newFolder("in/2/2.1");
    File dir2_2 = tmp.newFolder("in/2/2.2");
    File dir2_3 = tmp.newFolder("in/2/2.3");
    tmp.newFolder("in/2/2.4");

    createFile(dir2, "file1", 70);
    createFile(dir2, "file2", 30);
    createFile(dir2, "file3", 25);
    createFile(dir2, "file4", 30);
    createFile(dir2, "file5", 35);

    expectedBucketFiles.add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file2").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file3").getAbsolutePath()));

    /*
     * in/1/1.1 contains seven files and no dirs.
     *
     * in/
     *   1/
     *     1.1/
     *       file1 10 bytes
     *       file2 20 bytes
     *       file3 30 bytes
     *       file4 41 bytes
     *       file5 15 bytes
     *       file6 30 bytes
     *       file7 20 bytes
     *
     *   0          1          2
     *   file3 30   file6 30   file2 20
     *   file5 15   file1 10   file7 20
     *
     * file4 is > 50 * 0.8 so it is ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 7);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 6);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1);

    createFile(dir1_1, "file1", 10);
    createFile(dir1_1, "file2", 20);
    createFile(dir1_1, "file3", 30);
    createFile(dir1_1, "file4", 41);
    createFile(dir1_1, "file5", 15);
    createFile(dir1_1, "file6", 30);
    createFile(dir1_1, "file7", 20);

    expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file3").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file5").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file6").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file1").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file2").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file7").getAbsolutePath()));

    /*
     * in/1/1.2 contains two files.
     *
     * in/
     *   1/
     *     1.2/
     *       file1 20 bytes
     *       file2 10 bytes
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2);

    createFile(dir1_2, "file1", 20);
    createFile(dir1_2, "file2", 10);

    expectedBucketFiles.add(format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file1").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file2").getAbsolutePath()));

    /*
     * in/1/1.3 is empty.
     *
     * in/
     *   1/
     *     1.3/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    tmp.newFolder("in/1/1.3");

    /*
     * in/2/2.1 contains one file.
     *
     * in/
     *   2/
     *     2.1/
     *       file1 10 bytes
     *
     * Single file dirs are ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1);

    createFile(dir2_1, "file1", 10);

    /*
     * in/2/2.2 contains three files.
     *
     * in/
     *   2/
     *     2.2/
     *       file1 25 bytes
     *       file2 15 bytes
     *       file3 35 bytes
     *
     *   0          1
     *   file3 35   file1 25
     *              file2 15
     *
     * Bucket 0 with a single file is ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1);

    createFile(dir2_2, "file1", 25);
    createFile(dir2_2, "file2", 15);
    createFile(dir2_2, "file3", 35);

    expectedBucketFiles.add(format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file1").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file2").getAbsolutePath()));

    /*
     * in/2/2.3 contains 2 files.
     *
     * in/
     *   2/
     *     2.3/
     *       file1 41 bytes
     *       file2 10 bytes
     *
     * file1 is too big, leaving file2 as a single file, which is also ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2);

    createFile(dir2_3, "file1", 41);
    createFile(dir2_3, "file2", 10);

    /*
     * in/2/2.4 contains two sub directories and no files.
     *
     * in/
     *   2/
     *     2.4/
     *       2.4.1/
     *       2.4.2/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    tmp.newFolder("in/2/2.4");

    File dir2_4_1 = tmp.newFolder("in/2/2.4/2.4.1");
    File dir2_4_2 = tmp.newFolder("in/2/2.4/2.4.2");

    /*
     * in/
     *   2/
     *     2.4/
     *       2.4.1/
     *         file1 100 bytes
     *         file2 30 bytes
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2);

    createFile(dir2_4_1, "file1", 100);
    createFile(dir2_4_1, "file2", 30);

    /*
     * in/
     *   2/
     *     2.4/
     *       2.4.2/
     *         file1 20 bytes
     *         file2 20 bytes
     *         file3 10 bytes
     *
     *   0
     *   file1 20
     *   file2 20
     *   file3 10
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 3);

    createFile(dir2_4_2, "file1", 20);
    createFile(dir2_4_2, "file2", 20);
    createFile(dir2_4_2, "file3", 10);

    expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file1").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file2").getAbsolutePath()));
    expectedBucketFiles.add(format("%s %s", dir2_4_2.getAbsolutePath() + "-0", new File(dir2_4_2, "file3").getAbsolutePath()));

    Crush crush = new Crush();

    crush.setConf(job);
    crush.setFileSystem(fileSystem);

    /*
     * Call these in the same order that run() does.
     */
    crush.createJobConfAndParseArgs("--compress=none", "--max-file-blocks=1", in.getAbsolutePath(),
            new File(tmp.getRoot(), "out").getAbsolutePath(), "20101124171730");
    crush.writeDirs();

    /*
     * Verify bucket contents.
     */
    List<String> actualBucketFiles = new ArrayList<String>();

    Text key = new Text();
    Text value = new Text();

    Reader reader = new Reader(FileSystem.get(job), crush.getBucketFiles(), job);

    while (reader.next(key, value)) {
        actualBucketFiles.add(format("%s\t%s", key, value));
    }

    reader.close();

    Collections.sort(expectedBucketFiles);
    Collections.sort(actualBucketFiles);

    assertThat(actualBucketFiles, equalTo(expectedBucketFiles));

    /*
     * Verify the partition map.
     */
    Reader partitionMapReader = new Reader(FileSystem.get(job), crush.getPartitionMap(), job);

    IntWritable partNum = new IntWritable();

    Map<String, Integer> actualPartitions = new HashMap<String, Integer>();

    while (partitionMapReader.next(key, partNum)) {
        actualPartitions.put(key.toString(), partNum.get());
    }

    partitionMapReader.close();

    /*
     * These crush files need to be allocated into 5 partitions:
     *
     * in/2-1            55 bytes
     * in/1/1.1-0        45 bytes
     * in/1/1.1-2        40 bytes
     * in/1/1.1-1        40 bytes
     * in/1/1.2-0        30 bytes
     * in/2/2.2-1        40 bytes
     * in/2/2.4/2.4.2-0  50 bytes
     *
     *   0            1                      2                3                4
     *   in/2-1 55    in/2/2.4/2.4.2-0 50    in/1/1.1-0 45    in/1/1.1-2 40    in/1/1.1-1 40
     *                in/2/2.2-1 40                                            in/1/1.2-0 30
     */
    Map<String, Integer> expectedPartitions = new HashMap<String, Integer>();

    // TODO: this may not be deterministic due to jvm/hashmap/filesystem
    expectedPartitions.put(dir2.getAbsolutePath() + "-1", 0);
    expectedPartitions.put(dir2_4_2.getAbsolutePath() + "-0", 1);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-0", 2);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-2", 4);
    expectedPartitions.put(dir2_2.getAbsolutePath() + "-1", 3);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-1", 3);
    expectedPartitions.put(dir1_2.getAbsolutePath() + "-0", 4);

    assertThat(actualPartitions, equalTo(expectedPartitions));

    /*
     * Verify counters.
     */
    Counters actualCounters = new Counters();

    DataInputStream countersStream = FileSystem.get(job).open(crush.getCounters());

    actualCounters.readFields(countersStream);

    countersStream.close();

    assertThat(actualCounters, equalTo(expectedCounters));
}
From source file:com.hhscyber.nl.tweets.hbase2.Hbase2Reducer.java
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    try {
        for (Text value : values) {
            parseJSON(value.toString());
        }
        for (JsonTweet tw : tweets) {
            Put p = new Put(Bytes.toBytes(tw.getId()));
            if (tw.getUser().getId() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("id"), Bytes.toBytes(tw.getUser().getId())); }
            if (tw.getUser().getLocation() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("location"), Bytes.toBytes(tw.getUser().getLocation())); }
            if (tw.getUser().getDefault_profile() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("default_profile"), Bytes.toBytes(tw.getUser().getDefault_profile())); }
            if (tw.getUser().getProfile_background_tile() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("profile_backround_tile"), Bytes.toBytes(tw.getUser().getProfile_background_tile())); }
            if (tw.getUser().getStatuses_count() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("statuses_count"), Bytes.toBytes(tw.getUser().getStatuses_count())); }
            if (tw.getUser().getLang() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("lang"), Bytes.toBytes(tw.getUser().getLang())); }
            if (tw.getUser().getFollowing() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("following"), Bytes.toBytes(tw.getUser().getFollowing())); }
            if (tw.getUser().getProtected() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("protected"), Bytes.toBytes(tw.getUser().getProtected())); }
            if (tw.getUser().getFavourites_count() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("favorites_count"), Bytes.toBytes(tw.getUser().getFavourites_count())); }
            if (tw.getUser().getDescription() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("description"), Bytes.toBytes(tw.getUser().getDescription())); }
            if (tw.getUser().getVerified() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("verified"), Bytes.toBytes(tw.getUser().getVerified())); }
            if (tw.getUser().getContributors_enabled() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("contributors_enabled"), Bytes.toBytes(tw.getUser().getContributors_enabled())); }
            if (tw.getUser().getName() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("name"), Bytes.toBytes(tw.getUser().getName())); }
            if (tw.getUser().getCreated_at() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("created_at"), Bytes.toBytes(tw.getUser().getCreated_at())); }
            if (tw.getUser().getIs_translation_enabled() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("is_translation_enabled"), Bytes.toBytes(tw.getUser().getIs_translation_enabled())); }
            if (tw.getUser().getDefault_profile_image() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("default_profile_image"), Bytes.toBytes(tw.getUser().getDefault_profile_image())); }
            if (tw.getUser().getFollowers_count() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("followers_count"), Bytes.toBytes(tw.getUser().getFollowers_count())); }
            if (tw.getUser().getHas_extended_profile() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("has_extended_profile"), Bytes.toBytes(tw.getUser().getHas_extended_profile())); }
            if (tw.getUser().getProfile_image_url_https() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("profile_image_url_https"), Bytes.toBytes(tw.getUser().getProfile_image_url_https())); }
            if (tw.getUser().getGeo_enabled() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("geo_enabled"), Bytes.toBytes(tw.getUser().getGeo_enabled())); }
            if (tw.getUser().getProfile_background_image_url_https() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("profile_background_image_url_https"), Bytes.toBytes(tw.getUser().getProfile_background_image_url_https())); }
            if (tw.getUser().getUrl() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("url"), Bytes.toBytes(tw.getUser().getUrl())); }
            if (tw.getUser().getUtc_offset() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("utc_offset"), Bytes.toBytes(tw.getUser().getUtc_offset())); }
            if (tw.getUser().getTime_zone() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("time_zone"), Bytes.toBytes(tw.getUser().getTime_zone())); }
            if (tw.getUser().getNotifications() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("notifications"), Bytes.toBytes(tw.getUser().getNotifications())); }
            if (tw.getUser().getFriends_count() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("friends_count"), Bytes.toBytes(tw.getUser().getFriends_count())); }
            if (tw.getUser().getScreen_name() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("screen_name"), Bytes.toBytes(tw.getUser().getScreen_name())); }
            if (tw.getUser().getListed_count() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("listed_count"), Bytes.toBytes(tw.getUser().getListed_count())); }
            if (tw.getUser().getIs_translator() != null) { p.add(Bytes.toBytes("profile"), Bytes.toBytes("is_translator"), Bytes.toBytes(tw.getUser().getIs_translator())); }
            if (tw.getText() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("text"), Bytes.toBytes(tw.getText())); }
            if (tw.getKeyword() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("keyword"), Bytes.toBytes(tw.getKeyword())); }
            if (tw.getUrls() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("urls"), Bytes.toBytes(tw.getUrls())); }
            if (tw.getFavoriteCount() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("favorite_count"), Bytes.toBytes(tw.getFavoriteCount())); }
            if (tw.getRetweetCount() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("retweet_count"), Bytes.toBytes(tw.getRetweetCount())); }
            if (tw.getContributors() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("contributors"), Bytes.toBytes(tw.getContributors())); }
            if (tw.getCoordinates() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("coordinates"), Bytes.toBytes(tw.getCoordinates())); }
            if (tw.getCreatedAt() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("created_at"), Bytes.toBytes(tw.getCreatedAt())); }
            if (tw.getFavorited() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("favorited"), Bytes.toBytes(tw.getFavorited())); }
            if (tw.getGeo() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("geo"), Bytes.toBytes(tw.getGeo())); }
            if (tw.getTruncated() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("truncated"), Bytes.toBytes(tw.getTruncated())); }
            if (tw.getPlace() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("place"), Bytes.toBytes(tw.getPlace())); }
            if (tw.getSource() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("source"), Bytes.toBytes(tw.getSource())); }
            if (tw.getLang() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("lang"), Bytes.toBytes(tw.getLang())); }
            if (tw.getRetweeted() != null) { p.add(Bytes.toBytes("content"), Bytes.toBytes("retweeted"), Bytes.toBytes(tw.getRetweeted())); }
            context.write(null, p);
        }
    } catch (IOException | InterruptedException ex) {
        Logger.getLogger(ConcatTweetsReducer.class.getName()).log(Level.SEVERE, null, ex);
    }
}
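A side note on the HBase calls above: Put.add(family, qualifier, value) was deprecated in later HBase client releases in favour of Put.addColumn with the same arguments (addColumn is what the last example on this page uses). A sketch of one of the guarded writes ported to the newer call, reusing the tw and p names from the example above:

// Sketch only: same column family/qualifier as above, written with the newer addColumn API.
if (tw.getUser().getLocation() != null) {
    p.addColumn(Bytes.toBytes("profile"), Bytes.toBytes("location"),
            Bytes.toBytes(tw.getUser().getLocation()));
}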
From source file:com.hhscyber.nl.tweets.hbasefill.HbaseFillMapper.java
@Override
public void map(LongWritable key, Text val, Context context) throws IOException, InterruptedException {
    String line = val.toString();
    new JsonParse().openFileTest(line);
}
From source file:com.hhscyber.nl.tweets.processtweets.ProcessTweetsMapper.java
@Override
public void map(LongWritable key, Text val, Context context) throws IOException, InterruptedException {
    String line = val.toString();
    StringTokenizer itr = new StringTokenizer(line.toLowerCase());
    while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
    }
}
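This mapper emits (token, 1) pairs, so the Text keys it produces are typically consumed by a summing reducer. A minimal companion reducer, not part of the original source and shown only as a sketch of the usual counterpart:

// Hypothetical companion reducer: sums the 1s emitted per token by the mapper above.
public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();   // add up every count the mapper wrote for this word
        }
        result.set(sum);
        context.write(key, result);
    }
}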
From source file:com.hhscyber.nl.tweets.svm.features.SetReducer.java
@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
    System.out.println(key.toString() + " Counter " + (counter + 1));
    String idx = Integer.toString(++counter);
    if (isValid(key.toString())) {
        String skey = key.toString().toLowerCase();
        stemmer.setCurrent(skey);
        stemmer.stem();
        Put put = new Put(Bytes.toBytes(idx));
        put.add(Bytes.toBytes("word"), Bytes.toBytes("index"), Bytes.toBytes(stemmer.getCurrent()));
        context.write(null, put);
    }
}
From source file:com.hhscyber.nl.tweets.svm.train.TrainMapper.java
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String filePathString = ((FileSplit) context.getInputSplit()).getPath().toString();
    String[] tokens = filePathString.split("\\.(?=[^\\.]+$)");
    System.out.println(filePathString);

    Text label;
    String line;

    if (tokens[1].equals("bad")) {
        label = new Text("bad");
        line = "1";
    } else {
        label = new Text("good");
        line = "-1";
    }

    Set<Integer> hsline = new HashSet<>();
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
        String tok = itr.nextToken().toLowerCase();
        stemmer.setCurrent(tok);
        stemmer.stem();
        String idx = (String) words.get(stemmer.getCurrent());
        if (idx != null) {
            System.out.println(tok + " IDX " + idx);
            hsline.add(Integer.parseInt(idx));
        }
    }

    List<Integer> asline = new ArrayList<>();
    asline.addAll(hsline);
    Collections.sort(asline);

    for (Integer index : asline) {
        line += " " + index + ":1";
    }

    context.write(label, new Text(line));
}
From source file:com.hortonworks.mapreduce.URLCountM.java
License:Apache License
@Override
public void map(Text key, Text value, Context context) {
    try {
        LOG.log(Level.INFO, "MAPPER_KEY: ".concat(key.toString()).concat(" MAPPER_VALUE: ".concat(value.toString())));
        context.write(key, new IntWritable(Integer.valueOf(value.toString())));
    } catch (NumberFormatException | IOException | InterruptedException e) {
        LOG.log(Level.SEVERE, "ERROR: ".concat(e.toString()));
    }
}
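Because this mapper receives both the key and the value as Text, it is normally paired with an input format that splits each input line into a key/value pair, such as KeyValueTextInputFormat (which splits on the first tab). A hypothetical driver fragment, not taken from the project, showing how such a mapper might be wired up:

// Hypothetical driver fragment; the job name is made up.
Job job = Job.getInstance(new Configuration(), "url count");
job.setInputFormatClass(KeyValueTextInputFormat.class);   // feeds map(Text key, Text value)
job.setMapperClass(URLCountM.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);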
From source file:com.hortonworks.mrunit.MyReducer.java
License:Apache License
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    // A bunch of processing would normally extract the data to be inserted; in our case, let's say we simply
    // append all the records we receive from the mapper for this particular key and insert one record into HBase.
    StringBuffer data = new StringBuffer();
    Put put = new Put(Bytes.toBytes(key.toString()));
    for (Text val : values) {
        data = data.append(val);
    }
    put.addColumn(CF, QUALIFIER, Bytes.toBytes(data.toString()));
    // Write to HBase.
    context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);
}
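A reducer like this one emits ImmutableBytesWritable/Put pairs, so the job is usually wired to an HBase table through TableOutputFormat. A rough sketch of that wiring, not part of the original source; the table and job names are made up:

// Hypothetical job wiring: route (ImmutableBytesWritable, Put) output into an HBase table.
Configuration conf = HBaseConfiguration.create();
conf.set(TableOutputFormat.OUTPUT_TABLE, "my_table");     // made-up table name
Job job = Job.getInstance(conf, "hbase insert");
job.setOutputFormatClass(TableOutputFormat.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(Put.class);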