Example usage for the org.apache.hadoop.io.Text constructor Text()

List of usage examples for the org.apache.hadoop.io.Text constructor Text()

Introduction

On this page you can find example usages of the no-argument constructor Text() of org.apache.hadoop.io.Text.

Prototype

public Text() 

Source Link

Usage

From source file:cc.slda.AnnotateDocuments.java

License:Apache License

/**
 * Drains a sequence file of (IntWritable, Text) records into a map.
 *
 * @param sequenceFileReader reader to consume; it is advanced until {@code next} reports no
 *                           more records and is not closed by this method
 * @return a map from each record's int key to the string form of its value
 * @throws IOException if reading a record fails
 */
public static Map<Integer, String> importParameter(SequenceFile.Reader sequenceFileReader) throws IOException {
    final Map<Integer, String> parameters = new HashMap<Integer, String>();

    // The same holder objects are refilled on every call to next(), so the
    // primitive key and a String copy of the value are stored, not the holders.
    final IntWritable keyHolder = new IntWritable();
    final Text valueHolder = new Text();
    while (true) {
        if (!sequenceFileReader.next(keyHolder, valueHolder)) {
            break;
        }
        parameters.put(keyHolder.get(), valueHolder.toString());
    }

    return parameters;
}

From source file:cc.slda.DisplayTopic.java

License:Apache License

/**
 * Prints, for every topic stored in a beta sequence file, its highest-weighted terms.
 * <p>
 * Command-line options (names come from {@code Settings}, {@code ParseCorpus} and this class):
 * the input beta file (required), the term index file (required), and the number of top terms
 * to display (optional, defaults to {@code TOP_DISPLAY}).
 *
 * @param args command-line arguments
 * @return 0 when the listing completes
 * @throws Exception if the file system or either sequence file cannot be read, or if one of the
 *                   paths is not an existing file
 */
@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(Settings.HELP_OPTION, false, "print the help message");
    options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
            .withDescription("input beta file").create(Settings.INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
            .withDescription("term index file").create(ParseCorpus.INDEX));
    options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR).hasArg()
            .withDescription("display top terms only (default - 10)").create(TOP_DISPLAY_OPTION));

    String betaString = null;
    String indexString = null;
    int topDisplay = TOP_DISPLAY;

    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
        CommandLine line = parser.parse(options, args);

        if (line.hasOption(Settings.HELP_OPTION)) {
            formatter.printHelp(ParseCorpus.class.getName(), options);
            System.exit(0);
        }

        if (line.hasOption(Settings.INPUT_OPTION)) {
            betaString = line.getOptionValue(Settings.INPUT_OPTION);
        } else {
            throw new ParseException("Parsing failed due to " + Settings.INPUT_OPTION + " not initialized...");
        }

        if (line.hasOption(ParseCorpus.INDEX)) {
            indexString = line.getOptionValue(ParseCorpus.INDEX);
        } else {
            throw new ParseException("Parsing failed due to " + ParseCorpus.INDEX + " not initialized...");
        }

        if (line.hasOption(TOP_DISPLAY_OPTION)) {
            topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION));
        }
    } catch (ParseException pe) {
        // NOTE(review): exits with status 0 even though parsing failed; kept as-is because
        // callers may rely on it.
        System.err.println(pe.getMessage());
        formatter.printHelp(ParseCorpus.class.getName(), options);
        System.exit(0);
    } catch (NumberFormatException nfe) {
        System.err.println(nfe.getMessage());
        System.exit(0);
    }

    JobConf conf = new JobConf(DisplayTopic.class);
    FileSystem fs = FileSystem.get(conf);

    Path indexPath = new Path(indexString);
    Preconditions.checkArgument(fs.exists(indexPath) && fs.isFile(indexPath), "Invalid index path...");

    Path betaPath = new Path(betaString);
    Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path...");

    // Load the term-id -> term dictionary. The index reader gets its own try/finally so it is
    // closed before the beta reader is opened (previously it was overwritten and leaked).
    Map<Integer, String> termIndex = new HashMap<Integer, String>();
    SequenceFile.Reader indexReader = null;
    try {
        IntWritable intWritable = new IntWritable();
        Text text = new Text();
        indexReader = new SequenceFile.Reader(fs, indexPath, conf);
        while (indexReader.next(intWritable, text)) {
            termIndex.put(intWritable.get(), text.toString());
        }
    } finally {
        IOUtils.closeStream(indexReader);
    }

    SequenceFile.Reader betaReader = null;
    try {
        PairOfIntFloat pairOfIntFloat = new PairOfIntFloat();
        HMapIDW hmap = new HMapIDW();
        // Keyed by the NEGATED weight so that ascending key order is descending weight order.
        // NOTE(review): terms with exactly equal weight share one key, so only the last one
        // seen is kept.
        TreeMap<Double, Integer> treeMap = new TreeMap<Double, Integer>();
        betaReader = new SequenceFile.Reader(fs, betaPath, conf);
        while (betaReader.next(pairOfIntFloat, hmap)) {
            treeMap.clear();

            System.out.println("==============================");
            System.out.println(
                    "Top ranked " + topDisplay + " terms for Topic " + pairOfIntFloat.getLeftElement());
            System.out.println("==============================");

            // Keep at most topDisplay entries by evicting the largest key (smallest weight).
            Iterator<Integer> termIds = hmap.keySet().iterator();
            while (termIds.hasNext()) {
                int termId = termIds.next();
                treeMap.put(-hmap.get(termId), termId);
                if (treeMap.size() > topDisplay) {
                    treeMap.remove(treeMap.lastKey());
                }
            }

            for (Map.Entry<Double, Integer> ranked : treeMap.entrySet()) {
                double negatedWeight = ranked.getKey();
                Integer termId = ranked.getValue();
                if (termIndex.containsKey(termId)) {
                    System.out.println(termIndex.get(termId) + "\t\t" + -negatedWeight);
                } else {
                    System.out.println("How embarrassing! Term index not found...");
                }
            }
        }
    } finally {
        IOUtils.closeStream(betaReader);
    }

    return 0;
}

From source file:cereal.impl.ProtobufMessageMapping.java

License:Apache License

/**
 * Populates the protobuf builder wrapped by {@code obj} from the given Key-Value pairs.
 * <p>
 * Entries whose column qualifier contains {@code PERIOD} are set aside and later grouped by
 * nested-message field, then handed to the mapping registered for that nested message. All
 * other entries are matched by name against the builder's field descriptors and parsed
 * according to the field's Java type. For repeated fields the qualifier is expected to carry a
 * {@code DOLLAR}-separated suffix, which is stripped before the name comparison.
 *
 * @param iter Key-Value pairs to apply; must not be null
 * @param obj  wrapper that must hold a builder (not a built instance); must not be null
 * @throws RuntimeException if a qualifier naming a repeated field lacks the {@code DOLLAR}
 *                          separator, if a repeated nested qualifier lacks a {@code PERIOD}
 *                          after the separator, or if building a nested message fails
 */
@Override
public void update(Iterable<Entry<Key, Value>> iter, InstanceOrBuilder<T> obj) {
    checkNotNull(iter, "Iterable was null");
    checkNotNull(obj, "InstanceOrBuilder was null");
    checkArgument(Type.BUILDER == obj.getType(), "Expected argument to be a builder");

    final GeneratedMessage.Builder<?> builder = (GeneratedMessage.Builder<?>) obj.get();
    final List<Entry<Key, Value>> leftoverFields = new LinkedList<>();

    for (Entry<Key, Value> entry : iter) {
        final String fieldName = entry.getKey().getColumnQualifier().toString();

        // A period marks a field of a nested message; handle those after the primitives.
        int index = fieldName.indexOf(PERIOD);
        if (0 <= index) {
            leftoverFields.add(entry);
            continue;
        }

        // Find the FieldDescriptor from the Key
        for (FieldDescriptor fieldDesc : builder.getDescriptorForType().getFields()) {
            // Compute the name to compare per descriptor WITHOUT modifying fieldName: a
            // repeated descriptor that does not match must not affect later comparisons.
            String candidateName = fieldName;
            if (fieldDesc.isRepeated()) {
                int offset = fieldName.lastIndexOf(DOLLAR);
                if (offset < 0) {
                    if (fieldName.equals(fieldDesc.getName())) {
                        // The qualifier names this repeated field but has no repetition suffix.
                        throw new RuntimeException(
                                "Could not find offset of separator for repeated field count in " + fieldName);
                    }
                    // No separator, so this entry cannot belong to this repeated field.
                    continue;
                }
                candidateName = fieldName.substring(0, offset);
            }
            if (candidateName.equals(fieldDesc.getName())) {
                Value value = entry.getValue();
                switch (fieldDesc.getJavaType()) {
                case INT:
                    Integer intVal = Integer.parseInt(value.toString());
                    if (fieldDesc.isRepeated()) {
                        builder.addRepeatedField(fieldDesc, intVal);
                    } else {
                        builder.setField(fieldDesc, intVal);
                    }
                    break;
                case LONG:
                    Long longVal = Long.parseLong(value.toString());
                    if (fieldDesc.isRepeated()) {
                        builder.addRepeatedField(fieldDesc, longVal);
                    } else {
                        builder.setField(fieldDesc, longVal);
                    }
                    break;
                case FLOAT:
                    Float floatVal = Float.parseFloat(value.toString());
                    if (fieldDesc.isRepeated()) {
                        builder.addRepeatedField(fieldDesc, floatVal);
                    } else {
                        builder.setField(fieldDesc, floatVal);
                    }
                    break;
                case DOUBLE:
                    Double doubleVal = Double.parseDouble(value.toString());
                    if (fieldDesc.isRepeated()) {
                        builder.addRepeatedField(fieldDesc, doubleVal);
                    } else {
                        builder.setField(fieldDesc, doubleVal);
                    }
                    break;
                case BOOLEAN:
                    Boolean booleanVal = Boolean.parseBoolean(value.toString());
                    if (fieldDesc.isRepeated()) {
                        builder.addRepeatedField(fieldDesc, booleanVal);
                    } else {
                        builder.setField(fieldDesc, booleanVal);
                    }
                    break;
                case STRING:
                    String strVal = value.toString();
                    if (fieldDesc.isRepeated()) {
                        builder.addRepeatedField(fieldDesc, strVal);
                    } else {
                        builder.setField(fieldDesc, strVal);
                    }
                    break;
                case BYTE_STRING:
                    ByteString byteStrVal = ByteString.copyFrom(entry.getValue().get());
                    if (fieldDesc.isRepeated()) {
                        builder.addRepeatedField(fieldDesc, byteStrVal);
                    } else {
                        builder.setField(fieldDesc, byteStrVal);
                    }
                    break;
                default:
                    log.warn("Ignoring unknown serialized type {}", fieldDesc.getJavaType());
                    break;
                }
                // The descriptor for this entry was found; stop searching.
                break;
            }
        }
    }

    // All primitives in object should be filled out.
    // Make sure nested messages get filled out too.

    if (!leftoverFields.isEmpty()) {
        for (FieldDescriptor fieldDesc : builder.getDescriptorForType().getFields()) {
            if (JavaType.MESSAGE == fieldDesc.getJavaType()) {
                // For each Key-Value pair which have this prefix as the fieldname (column qualifier)
                final String fieldName = fieldDesc.getName();
                final String singularPrefix = fieldName + PERIOD, repeatedPrefix = fieldName + DOLLAR;

                log.debug("Extracting Key-Value pairs for {}", fieldDesc.getName());

                // Use a TreeMap to ensure the correct repetition order is preserved.
                // Pairs of a non-repeated nested message are collected under the key -1.
                Map<Integer, List<Entry<Key, Value>>> fieldsForNestedMessage = new TreeMap<>();

                // Reused buffer for reading each column qualifier.
                final Text _holder = new Text();
                Iterator<Entry<Key, Value>> leftoverFieldsIter = leftoverFields.iterator();
                while (leftoverFieldsIter.hasNext()) {
                    final Entry<Key, Value> entry = leftoverFieldsIter.next();
                    final Key key = entry.getKey();
                    entry.getKey().getColumnQualifier(_holder);

                    String colqual = _holder.toString();
                    if (colqual.startsWith(singularPrefix)) {
                        // Make a copy of the original Key, stripping the prefix off of the qualifier
                        Key copy = new Key(key.getRow(), key.getColumnFamily(),
                                new Text(colqual.substring(singularPrefix.length())), key.getColumnVisibility(),
                                key.getTimestamp());

                        List<Entry<Key, Value>> kvPairs = fieldsForNestedMessage.get(-1);
                        if (null == kvPairs) {
                            kvPairs = new LinkedList<>();
                            fieldsForNestedMessage.put(-1, kvPairs);
                        }
                        kvPairs.add(Maps.immutableEntry(copy, entry.getValue()));

                        // Remove it from the list as we should never have to reread this one again
                        leftoverFieldsIter.remove();
                    } else if (colqual.startsWith(repeatedPrefix)) {
                        // Make a copy of the original Key, stripping the prefix off of the qualifier
                        int index = colqual.indexOf(PERIOD, repeatedPrefix.length());
                        if (0 > index) {
                            throw new RuntimeException("Could not find period after dollar sign: " + colqual);
                        }

                        // The text between the separator and the period is the repetition number.
                        Integer repetition = Integer
                                .parseInt(colqual.substring(repeatedPrefix.length(), index));

                        Key copy = new Key(key.getRow(), key.getColumnFamily(),
                                new Text(colqual.substring(index + 1)), key.getColumnVisibility(),
                                key.getTimestamp());

                        List<Entry<Key, Value>> kvPairs = fieldsForNestedMessage.get(repetition);
                        if (null == kvPairs) {
                            kvPairs = new LinkedList<>();
                            fieldsForNestedMessage.put(repetition, kvPairs);
                        }
                        kvPairs.add(Maps.immutableEntry(copy, entry.getValue()));

                        // Remove it from the list as we should never have to reread this one again
                        leftoverFieldsIter.remove();
                    }
                }

                if (!fieldsForNestedMessage.isEmpty()) {
                    // We have keys, pass them down to the nested message
                    String nestedMsgClzName = getClassName(fieldDesc);

                    log.debug("Found {} Key-Value pairs for {}. Reconstituting the message.",
                            fieldsForNestedMessage.size(), nestedMsgClzName);

                    try {
                        // Get the class, builder and InstanceOrBuilder for the nested message
                        @SuppressWarnings("unchecked")
                        Class<GeneratedMessage> msgClz = (Class<GeneratedMessage>) Class
                                .forName(nestedMsgClzName);
                        Method newBuilderMethod = msgClz.getMethod("newBuilder");

                        for (Entry<Integer, List<Entry<Key, Value>>> pairsPerRepetition : fieldsForNestedMessage
                                .entrySet()) {
                            Message.Builder subBuilder = (Message.Builder) newBuilderMethod.invoke(null);
                            InstanceOrBuilder<GeneratedMessage> subIob = new InstanceOrBuilderImpl<>(subBuilder,
                                    msgClz);

                            // Get the mapping from the registry
                            ProtobufMessageMapping<GeneratedMessage> subMapping = (ProtobufMessageMapping<GeneratedMessage>) registry
                                    .get(subIob);

                            // Invoke update on the mapping with the subset of Key-Values
                            subMapping.update(pairsPerRepetition.getValue(), subIob);

                            // Set the result on the top-level obj
                            if (fieldDesc.isRepeated()) {
                                builder.addRepeatedField(fieldDesc, subBuilder.build());
                            } else {
                                builder.setField(fieldDesc, subBuilder.build());
                            }
                        }
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                } else {
                    // No fields for the sub message, therefore it's empty
                    log.debug("Found no Key-Value pairs for {}", fieldName);
                }
            }
            // Not a message, so we can ignore it
        }

        if (!leftoverFields.isEmpty()) {
            log.warn("Found {} leftover Key-Value pairs that were not consumed", leftoverFields.size());
        }
    }
}

From source file:clustering.link_back.io.Step2KeyWritable.java

License:Apache License

/**
 * Creates a key whose join key and tag are freshly allocated, unset writables.
 * <p>
 * The join key has the form entry_id@@g_no; the tag is the secondary sort
 * field, where 1 = cluster_id and 2 = content.
 */
public Step2KeyWritable() {
    tag = new IntWritable();
    joinKey = new Text();
}

From source file:cn.ac.ncic.mastiff.io.coding.ORCStringEcnodingUtil.java

License:Apache License

/**
 * Reads the next dictionary entry id from the reader and returns the dictionary string it
 * refers to.
 *
 * @param previous a Text to reuse as the decoding buffer, or null to allocate a new one
 * @return the decoded string; empty when there is no dictionary buffer
 * @throws IOException if the underlying reader fails
 */
public String readEachValue(Text previous) throws IOException {
    int entry = (int) reader.next();
    Text result = (previous == null) ? new Text() : previous;

    // If the column is just empty strings, the size will be zero,
    // so the buffer will be null, in that case just return result
    // as it will default to empty. This check must come before any use of
    // dictionaryBuffer: the length computation below dereferences it.
    if (dictionaryBuffer == null) {
        result.clear();
        return result.toString();
    }

    int offset = dictionaryOffsets[entry];
    int length;
    // if it isn't the last entry, subtract the offsets otherwise use
    // the buffer length.
    if (entry < dictionaryOffsets.length - 1) {
        length = dictionaryOffsets[entry + 1] - offset;
    } else {
        length = dictionaryBuffer.size() - offset;
    }
    dictionaryBuffer.setText(result, offset, length);
    return result.toString();
}

From source file:cn.ac.ncic.mastiff.io.coding.RedBlackTreeStringReader.java

License:Apache License

/**
 * Reads the next dictionary entry id from the reader and returns the dictionary string it
 * refers to.
 *
 * @param previous a Text to reuse as the decoding buffer, or null to allocate a new one
 * @return the decoded string; empty when there is no dictionary buffer
 * @throws IOException if the underlying reader fails
 */
public String readEachValue(Text previous) throws IOException {
    int entry = (int) reader.next();
    Text result = (previous == null) ? new Text() : previous;

    // If the column is just empty strings, the size will be zero,
    // so the buffer will be null, in that case just return result
    // as it will default to empty. This check must come before any use of
    // dictionaryBuffer: the length computation below dereferences it.
    if (dictionaryBuffer == null) {
        result.clear();
        return result.toString();
    }

    int offset = dictionaryOffsets[entry];
    int length;
    // The last entry has no following offset, so its length runs to the end of the buffer.
    if (entry < dictionaryOffsets.length - 1) {
        length = dictionaryOffsets[entry + 1] - offset;
    } else {
        length = dictionaryBuffer.size() - offset;
    }
    dictionaryBuffer.setText(result, offset, length);
    return result.toString();
}

From source file:cn.com.warlock.SequenceFilesTest.java

License:Apache License

/**
 * Writes 100 (IntWritable, Text) records to a sequence file on a hard-coded HDFS cluster,
 * using the compression type selected by the local {@code compressType} code
 * ("1" none, "2" record, "3" block, anything else none).
 *
 * @param args ignored
 * @throws IOException if the writer cannot be created or a record cannot be appended
 */
public static void main(String[] args) throws IOException {
    String hdfsUri = "hdfs://hlg-2p238-fandongsheng:8020";
    String pathStr = "/tmp/example/seq1";
    String compressType = "1";

    // NOTE(review): the original comment here was garbled; presumably this is only needed
    // when running on Windows - confirm.
    // System.setProperty("hadoop.home.dir", "E:\\tools");

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", hdfsUri);
    Path path = new Path(pathStr);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
        // Translate the compression code into a CompressionType, announcing the choice.
        CompressionType compression;
        if ("1".equals(compressType)) {
            System.out.println("compress none");
            compression = CompressionType.NONE;
        } else if ("2".equals(compressType)) {
            System.out.println("compress record");
            compression = CompressionType.RECORD;
        } else if ("3".equals(compressType)) {
            System.out.println("compress block");
            compression = CompressionType.BLOCK;
        } else {
            System.out.println("Default : compress none");
            compression = CompressionType.NONE;
        }

        writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(key.getClass()),
                SequenceFile.Writer.valueClass(value.getClass()),
                SequenceFile.Writer.compression(compression));

        // Keys count down from 100 to 1 while values cycle through DATA.
        for (int remaining = 100; remaining > 0; remaining--) {
            int i = 100 - remaining;
            key.set(remaining);
            value.set(DATA[i % DATA.length]);
            System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
            writer.append(key, value);
        }
    } finally {
        IOUtils.closeStream(writer);
    }
}

From source file:cn.edu.hfut.dmic.webcollectorcluster.generator.RecordGenerator.java

/**
 * Reads the next record from the reader and returns its datum.
 *
 * @return the datum that was read, or null when the reader reports no further record or an
 *         IOException occurs (its stack trace is printed)
 */
@Override
public CrawlDatum next() {
    // The key is required by the reader call but is not used afterwards.
    final Text key = new Text();
    final CrawlDatum datum = new CrawlDatum();
    try {
        if (reader.next(key, datum)) {
            return datum;
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return null;
}

From source file:cn.lhfei.hadoop.ch04.MapFileWriteDemo.java

License:Apache License

/**
 * Writes 1024 (IntWritable, Text) entries to a MapFile at the location given on the command
 * line. Keys run from 1 to 1024; values cycle through DATA.
 *
 * @param args args[0] is the URI of the MapFile to create
 */
public static void main(String[] args) {
    final String uri = args[0];
    final Configuration conf = new Configuration();

    final IntWritable key = new IntWritable();
    final Text value = new Text();
    MapFile.Writer writer = null;
    try {
        // The file system is looked up first; the returned handle is not used below.
        FileSystem.get(URI.create(uri), conf);

        writer = new MapFile.Writer(conf, new Path(uri), Writer.keyClass(key.getClass()),
                Writer.valueClass(value.getClass()));

        for (int n = 1; n <= 1024; n++) {
            key.set(n);
            value.set(DATA[(n - 1) % DATA.length]);
            writer.append(key, value);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        IOUtils.closeStream(writer);
    }
}

From source file:cn.lhfei.hadoop.ch04.SequenceFileWriteDemo.java

License:Apache License

/**
 * Writes 100 (IntWritable, Text) records to a sequence file at the location given on the
 * command line. Keys count down from 100 to 1; values cycle through DATA.
 *
 * @param args args[0] is the URI of the sequence file to create
 */
public static void main(String[] args) {

    String uri = args[0];
    Configuration conf = new Configuration();
    Path path = new Path(uri);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;

    try {
        // The target file must be passed as an option: the option-based createWriter needs a
        // file (or stream) to write to. The original built `path` but never supplied it, and
        // called the static valueClass(...) through the still-null `writer` reference.
        writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(key.getClass()),
                SequenceFile.Writer.valueClass(value.getClass()));

        for (int i = 0; i < 100; i++) {
            key.set(100 - i);
            value.set(DATA[i % DATA.length]);
            // Print the writer's current length before appending the record.
            System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key, value);
            writer.append(key, value);
        }

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // closeStream tolerates a null writer (creation may have failed).
        IOUtils.closeStream(writer);
    }

}