Example usage for org.apache.hadoop.io Text toString

List of usage examples for org.apache.hadoop.io Text toString

Introduction

On this page you can find example usage for org.apache.hadoop.io Text toString.

Prototype

@Override
public String toString() 

Source Link

Document

Convert text back to string
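
A minimal, self-contained sketch of the round trip (assuming only hadoop-common on the classpath): a value stored in a Text is held as UTF-8 bytes, and toString() decodes those bytes back into a java.lang.String.

import org.apache.hadoop.io.Text;

public class TextToStringExample {
    public static void main(String[] args) {
        // Text stores its contents as UTF-8 encoded bytes.
        Text text = new Text("hello hadoop");

        // toString() decodes the bytes back into a java.lang.String.
        String decoded = text.toString();

        System.out.println(decoded);                        // hello hadoop
        System.out.println(decoded.equals("hello hadoop")); // true
    }
}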

Usage

From source file:com.hdfs.concat.crush.CrushReducer.java

License:Apache License

@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector,
        Reporter reporter) throws IOException {
    String bucket = bucketId.toString();

    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));

    int idx = findMatcher(dirName);

    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);

    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it to the task attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    Exception rootCause = null;

    Object key = null;
    Object value = null;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());

            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            try {
                if (null == key) {
                    key = reader.createKey();
                    value = reader.createValue();

                    /*
                     * Set the key and value class in the conf, which the output format uses to get type information.
                     */
                    job.setOutputKeyClass(key.getClass());
                    job.setOutputValueClass(value.getClass());

                    /*
                     * Output file name is absolute so we can just add it to the crush prefix.
                     */
                    sink = createRecordWriter(idx, "crush" + outputFileName);
                } else {

                    Class<?> other = reader.createKey().getClass();

                    if (!(key.getClass().equals(other))) {
                        throw new IllegalArgumentException(format("Heterogeneous keys detected in %s: %s !- %s",
                                inputPath, key.getClass(), other));
                    }

                    other = reader.createValue().getClass();

                    if (!value.getClass().equals(other)) {
                        throw new IllegalArgumentException(
                                format("Heterogeneous values detected in %s: %s !- %s", inputPath,
                                        value.getClass(), other));
                    }
                }

                while (reader.next(key, value)) {
                    sink.write(key, value);
                    reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);
                }
            } catch (Exception e) {
                rootCause = e;
            } finally {
                try {
                    reader.close();
                } catch (Exception e) {
                    if (null == rootCause) {
                        rootCause = e;
                    } else {
                        LOG.debug("Swallowing exception on close of " + inputPath, e);
                    }
                }
            }

            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;

            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;

                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }

            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }

            throw new RuntimeException(rootCause);
        }
    }
}
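
The call to bucketId.toString() at the top of reduce() is what makes the string slicing possible: the reducer key arrives as a Text, and substring/lastIndexOf exist only on String. A small hypothetical illustration of that parsing, using an invented bucket id of the same shape (the actual paths come from the crush job's bucket files):

    Text bucketId = new Text("/user/data/in/1/1.1-0");                   // invented sample value
    String bucket = bucketId.toString();
    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));       // "/user/data/in/1/1.1"
    String bucketIndex = bucket.substring(bucket.lastIndexOf('-') + 1);  // "0"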

From source file:com.hdfs.concat.crush.CrushStandAloneSequenceFileTest.java

License:Apache License

private void verifyFile(File dir, String fileName, int key, int count) throws IOException {
    File file = new File(dir, fileName);

    Reader reader = new Reader(FileSystem.get(job), new Path(file.getAbsolutePath()), job);

    int i = 0;
    int actual = 0;

    Text text = new Text();
    IntWritable value = new IntWritable();

    while (reader.next(text, value)) {
        assertThat(text.toString(), equalTo(Integer.toString(key)));
        assertThat(value.get(), equalTo(i));

        if (i == 9) {
            i = 0;
        } else {
            i++;
        }

        actual++;
    }

    reader.close();

    assertThat(actual, equalTo(count));
}
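
The assertion above compares text.toString() to a String rather than comparing the Text itself. A minimal standalone sketch of why: Text.equals(Object) only matches other Text instances, so toString() is the bridge to String-based matchers.

    Text text = new Text("42");

    // Text.equals(Object) only matches other Text instances.
    System.out.println(text.equals("42"));            // false
    System.out.println(text.equals(new Text("42")));  // true

    // toString() yields a plain String that works with equalTo("...") style matchers.
    System.out.println(text.toString().equals("42")); // true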

From source file:com.hdfs.concat.crush.CrushTest.java

License:Apache License

@Test
public void bucketing() throws Exception {
    File in = tmp.newFolder("in");

    Counters expectedCounters = new Counters();
    List<String> expectedBucketFiles = new ArrayList<String>();

    /*
     * Create a hierarchy of directories. Directories are distinguished by a trailing slash in these comments.
     *
     *   1/
     *         1.1/
     *               file1 10 bytes
     *               file2 20 bytes
     *               file3 30 bytes
     *               file4 41 bytes
     *               file5 15 bytes
     *               file6 30 bytes
     *               file7   20 bytes
     *         1.2/
     *               file1 20 bytes
     *               file2 10 bytes
     *         1.3/
     *   2/
     *         file1 70 bytes
     *         file2 30 bytes
     *         file3 25 bytes
     *         file4 30 bytes
     *         file5 35 bytes
     *         2.1/
     *               file1 10 bytes
     *         2.2/
     *               file1 25 bytes
     *               file2 15 bytes
     *               file3 35 bytes
     *         2.3/
     *               file1 41 bytes
     *               file2 10 bytes
     *         2.4/
     *               2.4.1/
     *                     file1 100 bytes
     *                     file2   30 bytes
     *               2.4.2/
     *                     file1 20 bytes
     *                     file2 20 bytes
     *                     file3 10 bytes
     */

    /*
     * in contains 2 dirs and no files so it is skipped.
     *
     *    in/
     *          1/
     *          2/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    tmp.newFolder("in/1");
    File dir2 = tmp.newFolder("in/2");

    /*
     * in/1 contains three dirs and no files so it is skipped.
     *
     *    in/
     *          1/
     *                1.1/
     *                1.2/
     *                1.3/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    File dir1_1 = tmp.newFolder("in/1/1.1");
    File dir1_2 = tmp.newFolder("in/1/1.2");
    tmp.newFolder("in/1/1.3");

    /*
     * in/2 contains five files and four dirs.
     *
     *    in/
     *          2/
     *               file1 70 bytes
     *               file2 30 bytes
     *               file3 25 bytes
     *               file4 30 bytes
     *               file5 35 bytes
     *                2.1/
     *                2.2/
     *                2.3/
     *                2.4/
     *
     *    0             1             2
     *    file5 35      file2 30      file4 30
     *                  file3 25
     *
     * Buckets 0 and 2 have a single file each so they are ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 5);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 3);

    File dir2_1 = tmp.newFolder("in/2/2.1");
    File dir2_2 = tmp.newFolder("in/2/2.2");
    File dir2_3 = tmp.newFolder("in/2/2.3");
    tmp.newFolder("in/2/2.4");

    createFile(dir2, "file1", 70);
    createFile(dir2, "file2", 30);
    createFile(dir2, "file3", 25);
    createFile(dir2, "file4", 30);
    createFile(dir2, "file5", 35);

    expectedBucketFiles
            .add(format("%s   %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file2").getAbsolutePath()));
    expectedBucketFiles
            .add(format("%s   %s", dir2.getAbsolutePath() + "-1", new File(dir2, "file3").getAbsolutePath()));

    /*
     * in/1/1.1 contains seven files and no dirs.
     *
     *    in/
     *          1/
     *                1.1/
     *                     file1 10 bytes
     *                     file2 20 bytes
     *                     file3 30 bytes
     *                     file4 41 bytes
     *                     file5 15 bytes
     *                     file6 30 bytes
     *                     file7   20 bytes
     *
     *    0             1             2
     *    file3 30      file6 30      file2 20
     *    file5 15      file1 10      file7 20
     *
     * file4 is > 50 * 0.8 so it is ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 7);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 6);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1);

    createFile(dir1_1, "file1", 10);
    createFile(dir1_1, "file2", 20);
    createFile(dir1_1, "file3", 30);
    createFile(dir1_1, "file4", 41);
    createFile(dir1_1, "file5", 15);
    createFile(dir1_1, "file6", 30);
    createFile(dir1_1, "file7", 20);

    expectedBucketFiles.add(
            format("%s   %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file3").getAbsolutePath()));
    expectedBucketFiles.add(
            format("%s   %s", dir1_1.getAbsolutePath() + "-0", new File(dir1_1, "file5").getAbsolutePath()));
    expectedBucketFiles.add(
            format("%s   %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file6").getAbsolutePath()));
    expectedBucketFiles.add(
            format("%s   %s", dir1_1.getAbsolutePath() + "-1", new File(dir1_1, "file1").getAbsolutePath()));
    expectedBucketFiles.add(
            format("%s   %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file2").getAbsolutePath()));
    expectedBucketFiles.add(
            format("%s   %s", dir1_1.getAbsolutePath() + "-2", new File(dir1_1, "file7").getAbsolutePath()));

    /*
     * in/1/1.2 contains two files.
     *
     *    in/
     *          1/
     *                1.2/
     *                     file1 20 bytes
     *                     file2 10 bytes
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2);

    createFile(dir1_2, "file1", 20);
    createFile(dir1_2, "file2", 10);

    expectedBucketFiles.add(
            format("%s   %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file1").getAbsolutePath()));
    expectedBucketFiles.add(
            format("%s   %s", dir1_2.getAbsolutePath() + "-0", new File(dir1_2, "file2").getAbsolutePath()));

    /*
     * in/1/1.3 is empty.
     *
     *    in/
     *          1/
     *                1.3/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    tmp.newFolder("in/1/1.3");

    /*
     * in/2/2.1 contains one file.
     *
     *    in/
     *          2/
     *                2.1/
     *                     file1 10 bytes
     *
     * Single file dirs are ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1);

    createFile(dir2_1, "file1", 10);

    /*
     * in/2/2.2 contains three files.
     *
     *    in/
     *          2/
     *                2.2/
     *                     file1 25 bytes
     *                     file2 15 bytes
     *                     file3 35 bytes
     *
     *    0             1
     *    file3 35      file1 25
     *                  file2 15
     *
     * Bucket 0 with a single file is ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 1);

    createFile(dir2_2, "file1", 25);
    createFile(dir2_2, "file2", 15);
    createFile(dir2_2, "file3", 35);

    expectedBucketFiles.add(
            format("%s   %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file1").getAbsolutePath()));
    expectedBucketFiles.add(
            format("%s   %s", dir2_2.getAbsolutePath() + "-1", new File(dir2_2, "file2").getAbsolutePath()));

    /*
     * in/2/2.3 contains 2 files.
     *
     *    in/
     *          2/
     *                2.3/
     *                     file1 41 bytes
     *                     file2 10 bytes
     *
     * file1 is too big, leaving file2 as a single file, which is also ignored.
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2);

    createFile(dir2_3, "file1", 41);
    createFile(dir2_3, "file2", 10);

    /*
     * in/2/2.4 contains two sub directories and no files.
     *
     *    in/
     *          2/
     *               2.4/
     *                     2.4.1/
     *                     2.4.2/
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    tmp.newFolder("in/2/2.4");

    File dir2_4_1 = tmp.newFolder("in/2/2.4/2.4.1");
    File dir2_4_2 = tmp.newFolder("in/2/2.4/2.4.2");

    /*
     *    in/
     *          2/
     *               2.4/
     *                     2.4.1/
     *                           file1 100 bytes
     *                           file2   30 bytes
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 2);
    expectedCounters.incrCounter(MapperCounter.FILES_SKIPPED, 2);

    createFile(dir2_4_1, "file1", 100);
    createFile(dir2_4_1, "file2", 30);

    /*
     *    in/
     *          2/
     *               2.4/
     *                     2.4.2/
     *                           file1 20 bytes
     *                           file2 20 bytes
     *                           file3 10 bytes
     *   0
     *   file1 20
     *   file2 20
     *   file3 10
     */
    expectedCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);
    expectedCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);

    expectedCounters.incrCounter(MapperCounter.FILES_FOUND, 3);
    expectedCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, 3);

    createFile(dir2_4_2, "file1", 20);
    createFile(dir2_4_2, "file2", 20);
    createFile(dir2_4_2, "file3", 10);

    expectedBucketFiles.add(format("%s   %s", dir2_4_2.getAbsolutePath() + "-0",
            new File(dir2_4_2, "file1").getAbsolutePath()));
    expectedBucketFiles.add(format("%s   %s", dir2_4_2.getAbsolutePath() + "-0",
            new File(dir2_4_2, "file2").getAbsolutePath()));
    expectedBucketFiles.add(format("%s   %s", dir2_4_2.getAbsolutePath() + "-0",
            new File(dir2_4_2, "file3").getAbsolutePath()));

    Crush crush = new Crush();

    crush.setConf(job);
    crush.setFileSystem(fileSystem);

    /*
     * Call these in the same order that run() does.
     */
    crush.createJobConfAndParseArgs("--compress=none", "--max-file-blocks=1", in.getAbsolutePath(),
            new File(tmp.getRoot(), "out").getAbsolutePath(), "20101124171730");
    crush.writeDirs();

    /*
     * Verify bucket contents.
     */

    List<String> actualBucketFiles = new ArrayList<String>();

    Text key = new Text();
    Text value = new Text();

    Reader reader = new Reader(FileSystem.get(job), crush.getBucketFiles(), job);

    while (reader.next(key, value)) {
        actualBucketFiles.add(format("%s\t%s", key, value));
    }

    reader.close();

    Collections.sort(expectedBucketFiles);
    Collections.sort(actualBucketFiles);

    assertThat(actualBucketFiles, equalTo(expectedBucketFiles));

    /*
     * Verify the partition map.
     */
    Reader partitionMapReader = new Reader(FileSystem.get(job), crush.getPartitionMap(), job);

    IntWritable partNum = new IntWritable();

    Map<String, Integer> actualPartitions = new HashMap<String, Integer>();

    while (partitionMapReader.next(key, partNum)) {
        actualPartitions.put(key.toString(), partNum.get());
    }

    partitionMapReader.close();

    /*
     * These crush files need to be allocated into 5 partitions:
     *
     * in/2-1             55 bytes
     * in/1/1.1-0         45 bytes
     * in/1/1.1-2         40 bytes
     * in/1/1.1-1         40 bytes
     * in/1/1.2-0         30 bytes
     * in/2/2.2-1         40 bytes
     * in/2/2.4/2.4.2-0   50 bytes
     *
     *    0: in/2-1            55
     *    1: in/2/2.4/2.4.2-0  50
     *    2: in/1/1.1-0        45
     *    3: in/1/1.1-1        40   in/2/2.2-1   40
     *    4: in/1/1.1-2        40   in/1/1.2-0   30
     */
    Map<String, Integer> expectedPartitions = new HashMap<String, Integer>();

    //TODO: this may not be deterministic due to jvm/hashmap/filesystem
    expectedPartitions.put(dir2.getAbsolutePath() + "-1", 0);
    expectedPartitions.put(dir2_4_2.getAbsolutePath() + "-0", 1);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-0", 2);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-2", 4);
    expectedPartitions.put(dir2_2.getAbsolutePath() + "-1", 3);
    expectedPartitions.put(dir1_1.getAbsolutePath() + "-1", 3);
    expectedPartitions.put(dir1_2.getAbsolutePath() + "-0", 4);

    assertThat(actualPartitions, equalTo(expectedPartitions));

    /*
     * Verify counters.
     */
    Counters actualCounters = new Counters();

    DataInputStream countersStream = FileSystem.get(job).open(crush.getCounters());

    actualCounters.readFields(countersStream);

    countersStream.close();

    assertThat(actualCounters, equalTo(expectedCounters));
}
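
In this test Text.toString() is used in two ways: implicitly, when format("%s\t%s", key, value) renders each bucket-file entry (the %s conversion calls toString() on its argument), and explicitly, when key.toString() is used as the HashMap key for the partition map. The explicit call matters because the same Text instance is reused and overwritten by every partitionMapReader.next(key, partNum) call, so only the detached String is safe to store.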

From source file:com.hhscyber.nl.tweets.hbase2.Hbase2Reducer.java

@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    try {
        for (Text value : values) {
            parseJSON(value.toString());
        }

        for (JsonTweet tw : tweets) {
            Put p = new Put(Bytes.toBytes(tw.getId()));
            if (tw.getUser().getId() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("id"), Bytes.toBytes(tw.getUser().getId()));
            }
            if (tw.getUser().getLocation() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("location"),
                        Bytes.toBytes(tw.getUser().getLocation()));
            }
            if (tw.getUser().getDefault_profile() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("default_profile"),
                        Bytes.toBytes(tw.getUser().getDefault_profile()));
            }
            if (tw.getUser().getProfile_background_tile() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("profile_backround_tile"),
                        Bytes.toBytes(tw.getUser().getProfile_background_tile()));
            }
            if (tw.getUser().getStatuses_count() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("statuses_count"),
                        Bytes.toBytes(tw.getUser().getStatuses_count()));
            }
            if (tw.getUser().getLang() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("lang"), Bytes.toBytes(tw.getUser().getLang()));
            }
            if (tw.getUser().getFollowing() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("following"),
                        Bytes.toBytes(tw.getUser().getFollowing()));
            }
            if (tw.getUser().getProtected() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("protected"),
                        Bytes.toBytes(tw.getUser().getProtected()));
            }
            if (tw.getUser().getFavourites_count() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("favorites_count"),
                        Bytes.toBytes(tw.getUser().getFavourites_count()));
            }
            if (tw.getUser().getDescription() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("description"),
                        Bytes.toBytes(tw.getUser().getDescription()));
            }
            if (tw.getUser().getVerified() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("verified"),
                        Bytes.toBytes(tw.getUser().getVerified()));
            }
            if (tw.getUser().getContributors_enabled() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("contributors_enabled"),
                        Bytes.toBytes(tw.getUser().getContributors_enabled()));
            }
            if (tw.getUser().getName() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("name"), Bytes.toBytes(tw.getUser().getName()));
            }
            if (tw.getUser().getCreated_at() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("created_at"),
                        Bytes.toBytes(tw.getUser().getCreated_at()));
            }
            if (tw.getUser().getIs_translation_enabled() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("is_translation_enabled"),
                        Bytes.toBytes(tw.getUser().getIs_translation_enabled()));
            }
            if (tw.getUser().getDefault_profile_image() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("default_profile_image"),
                        Bytes.toBytes(tw.getUser().getDefault_profile_image()));
            }
            if (tw.getUser().getFollowers_count() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("followers_count"),
                        Bytes.toBytes(tw.getUser().getFollowers_count()));
            }
            if (tw.getUser().getHas_extended_profile() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("has_extended_profile"),
                        Bytes.toBytes(tw.getUser().getHas_extended_profile()));
            }
            if (tw.getUser().getProfile_image_url_https() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("profile_image_url_https"),
                        Bytes.toBytes(tw.getUser().getProfile_image_url_https()));
            }
            if (tw.getUser().getGeo_enabled() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("geo_enabled"),
                        Bytes.toBytes(tw.getUser().getGeo_enabled()));
            }
            if (tw.getUser().getProfile_background_image_url_https() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("profile_background_image_url_https"),
                        Bytes.toBytes(tw.getUser().getProfile_background_image_url_https()));
            }
            if (tw.getUser().getUrl() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("url"), Bytes.toBytes(tw.getUser().getUrl()));
            }
            if (tw.getUser().getUtc_offset() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("utc_offset"),
                        Bytes.toBytes(tw.getUser().getUtc_offset()));
            }
            if (tw.getUser().getTime_zone() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("time_zone"),
                        Bytes.toBytes(tw.getUser().getTime_zone()));
            }
            if (tw.getUser().getNotifications() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("notifications"),
                        Bytes.toBytes(tw.getUser().getNotifications()));
            }
            if (tw.getUser().getFriends_count() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("friends_count"),
                        Bytes.toBytes(tw.getUser().getFriends_count()));
            }
            if (tw.getUser().getScreen_name() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("screen_name"),
                        Bytes.toBytes(tw.getUser().getScreen_name()));
            }
            if (tw.getUser().getListed_count() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("listed_count"),
                        Bytes.toBytes(tw.getUser().getListed_count()));
            }
            if (tw.getUser().getIs_translator() != null) {
                p.add(Bytes.toBytes("profile"), Bytes.toBytes("is_translator"),
                        Bytes.toBytes(tw.getUser().getIs_translator()));
            }

            if (tw.getText() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("text"), Bytes.toBytes(tw.getText()));
            }
            if (tw.getKeyword() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("keyword"), Bytes.toBytes(tw.getKeyword()));
            }
            if (tw.getUrls() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("urls"), Bytes.toBytes(tw.getUrls()));
            }

            if (tw.getFavoriteCount() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("favorite_count"),
                        Bytes.toBytes(tw.getFavoriteCount()));
            }
            if (tw.getRetweetCount() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("retweet_count"),
                        Bytes.toBytes(tw.getRetweetCount()));
            }
            if (tw.getContributors() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("contributors"),
                        Bytes.toBytes(tw.getContributors()));
            }
            if (tw.getCoordinates() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("coordinates"),
                        Bytes.toBytes(tw.getCoordinates()));
            }
            if (tw.getCreatedAt() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("created_at"), Bytes.toBytes(tw.getCreatedAt()));
            }
            if (tw.getFavorited() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("favorited"), Bytes.toBytes(tw.getFavorited()));
            }
            if (tw.getGeo() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("geo"), Bytes.toBytes(tw.getGeo()));
            }
            if (tw.getTruncated() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("truncated"), Bytes.toBytes(tw.getTruncated()));
            }
            if (tw.getPlace() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("place"), Bytes.toBytes(tw.getPlace()));
            }
            if (tw.getSource() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("source"), Bytes.toBytes(tw.getSource()));
            }
            if (tw.getLang() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("lang"), Bytes.toBytes(tw.getLang()));
            }
            if (tw.getRetweeted() != null) {
                p.add(Bytes.toBytes("content"), Bytes.toBytes("retweeted"), Bytes.toBytes(tw.getRetweeted()));
            }

            context.write(null, p);

        }
    } catch (IOException | InterruptedException ex) {
        Logger.getLogger(Hbase2Reducer.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:com.hhscyber.nl.tweets.hbasefill.HbaseFillMapper.java

@Override
public void map(LongWritable key, Text val, Context context) throws IOException, InterruptedException {
    String line = val.toString();
    new JsonParse().openFileTest(line);
}

From source file:com.hhscyber.nl.tweets.processtweets.ProcessTweetsMapper.java

@Override
public void map(LongWritable key, Text val, Context context) throws IOException, InterruptedException {
    String line = val.toString();
    StringTokenizer itr = new StringTokenizer(line.toLowerCase());
    while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
    }

}
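
The snippet references word and one without showing their declarations. A plausible sketch of the assumed mapper fields, following the standard word-count pattern (these names and types are inferred, not shown in the source):

    // Assumed reusable output objects of the word-count style mapper.
    private final static IntWritable one = new IntWritable(1);
    private final Text word = new Text();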

From source file:com.hhscyber.nl.tweets.svm.features.SetReducer.java

@Override
public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
    System.out.println(key.toString() + " Counter " + (counter + 1));
    String idx = Integer.toString(++counter);

    if (isValid(key.toString())) {

        String skey = key.toString().toLowerCase();
        stemmer.setCurrent(skey);
        stemmer.stem();

        Put put = new Put(Bytes.toBytes(idx));
        put.add(Bytes.toBytes("word"), Bytes.toBytes("index"), Bytes.toBytes(stemmer.getCurrent()));

        context.write(null, put);
    }
}
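
This reducer relies on fields the snippet does not show: a running counter, a stemmer with a setCurrent/stem/getCurrent API, and an isValid(String) filter. A hedged sketch of what those members might look like, assuming the Snowball stemmer library (the concrete stemmer class and the isValid rules are guesses, not taken from the source):

    // Assumed supporting members; the setCurrent/stem/getCurrent calls match the
    // Snowball stemmer API, here guessed as the English stemmer.
    private int counter = 0;
    private final org.tartarus.snowball.ext.englishStemmer stemmer =
            new org.tartarus.snowball.ext.englishStemmer();

    // isValid(String) is project-specific token filtering and is not part of the snippet.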

From source file:com.hhscyber.nl.tweets.svm.train.TrainMapper.java

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    String filePathString = ((FileSplit) context.getInputSplit()).getPath().toString();
    String[] tokens = filePathString.split("\\.(?=[^\\.]+$)");
    System.out.println(filePathString);

    Text label;
    String line;
    if (tokens[1].equals("bad")) {
        label = new Text("bad");
        line = "1";
    } else {
        label = new Text("good");
        line = "-1";
    }

    Set<Integer> hsline = new HashSet<>();
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
        String tok = itr.nextToken().toLowerCase();
        stemmer.setCurrent(tok);
        stemmer.stem();
        String idx = (String) words.get(stemmer.getCurrent());
        if (idx != null) {
            System.out.println(tok + " IDX " + idx);
            hsline.add(Integer.parseInt(idx));
        }
    }

    List<Integer> asline = new ArrayList<>();
    asline.addAll(hsline);

    Collections.sort(asline);

    for (Integer index : asline) {
        line += " " + index + ":1";
    }
    context.write(label, new Text(line));
}
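
Here value.toString() hands the raw input line to the tokenizer so each token can be stemmed and looked up. The words collection is not shown in the snippet; presumably it is a map (for example a Properties or Map loaded in setup()) from stemmed token to feature index, mirroring the word/index table written by SetReducer above, but that is an inference rather than something the source states.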

From source file:com.hortonworks.mapreduce.URLCountM.java

License:Apache License

@Override
public void map(Text key, Text value, Context context) {

    try {
        LOG.log(Level.INFO,
                "MAPPER_KEY: ".concat(key.toString()).concat(" MAPPER_VALUE: ".concat(value.toString())));
        context.write(key, new IntWritable(Integer.valueOf(value.toString())));
    } catch (NumberFormatException | IOException | InterruptedException e) {
        LOG.log(Level.SEVERE, "ERROR: ".concat(e.toString()));
    }
}
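
Both the key and the value arrive as Text (presumably via KeyValueTextInputFormat or a similar key/value text input), so value.toString() is the necessary step before Integer.valueOf can parse the count; a non-numeric value surfaces as the NumberFormatException handled in the catch block.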

From source file:com.hortonworks.mrunit.MyReducer.java

License:Apache License

public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    // Bunch of processing to extract the data to be inserted; in our case, let's say we are simply
    // appending all the records we receive from the mapper for this particular
    // key and inserting one record into HBase.
    StringBuffer data = new StringBuffer();
    Put put = new Put(Bytes.toBytes(key.toString()));
    for (Text val : values) {
        data = data.append(val);
    }
    put.addColumn(CF, QUALIFIER, Bytes.toBytes(data.toString()));
    //write to HBase
    context.write(new ImmutableBytesWritable(Bytes.toBytes(key.toString())), put);
}
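
Note that Text.toString() is involved even where it is not written out: data.append(val) goes through StringBuffer.append(Object), which stringifies the Text via its toString() method, while key.toString() is called explicitly to build both the HBase row key and the output ImmutableBytesWritable.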