Example usage for org.apache.hadoop.io Text toString

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text#toString().

Prototype

@Override
public String toString() 

Document

Convert text back to string
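
Before the project examples below, here is a minimal, self-contained sketch of the round trip (the class name and strings are illustrative; only hadoop-common is assumed on the classpath): a Text object stores its content as UTF-8 bytes, and toString() decodes those bytes back into a java.lang.String.

import org.apache.hadoop.io.Text;

public class TextToStringExample {
    public static void main(String[] args) {
        // Text stores its content as UTF-8 bytes.
        Text text = new Text("hello hadoop");

        // toString() decodes the bytes back into a regular Java String.
        String decoded = text.toString();
        System.out.println(decoded); // hello hadoop

        // Writables are often reused; toString() always reflects the current bytes.
        text.set("second value");
        System.out.println(text.toString()); // second value
    }
}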

Usage

From source file:com.littlehotspot.hadoop.mr.nginx.module.hdfs2hbase.api.user.UserMapper.java

License:Open Source License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    try {
        String rowLineContent = value.toString();
        //            System.out.println(rowLineContent);
        Matcher matcher = CommonVariables.MAPPER_INPUT_FORMAT_REGEX.matcher(rowLineContent);
        if (!matcher.find()) {
            return;
        }
        String deviceId = matcher.group(16);
        if (StringUtils.isBlank(deviceId)) {
            return;
        }
        Text keyText = new Text(deviceId);
        System.out.println(rowLineContent);
        context.write(keyText, value);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.littlehotspot.hadoop.mr.nginx.module.hdfs2hbase.api.user.UserReducer.java

License:Open Source License

@Override
protected void reduce(Text key, Iterable<Text> value, Reducer<Text, Text, Text, Text>.Context context)
        throws IOException, InterruptedException {
    try {
        Iterator<Text> textIterator = value.iterator();
        TextTargetUserBean targetUserBean = new TextTargetUserBean();
        while (textIterator.hasNext()) {
            Text item = textIterator.next();
            if (item == null) {
                continue;
            }
            String rowLineContent = item.toString();
            TextSourceUserBean sourceUserBean = new TextSourceUserBean(rowLineContent);
            targetUserBean.setDeviceId(sourceUserBean.getDeviceId());
            targetUserBean.setDeviceType(sourceUserBean.getDeviceType());
            targetUserBean.setMachineModel(sourceUserBean.getMachineModel());
            targetUserBean.setSince(sourceUserBean.getChannelId());
            //                targetUserBean.setToken();
            //                targetUserBean.setDemandTime();
            //                targetUserBean.setProjectionTime();
            this.setDownloadTime(sourceUserBean, targetUserBean);
        }

        CommonVariables.hBaseHelper.insert(targetUserBean);

        context.write(new Text(targetUserBean.rowLine()), new Text());
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.lovelysystems.hive.udf.MemcachedUDF.java

License:Apache License

private MemcachedClient getClient(Text servers) throws IOException {
    if (!clients.containsKey(servers)) {
        MemcachedClient client = new MemcachedClient(defaultConnFactory,
                AddrUtil.getAddresses(servers.toString()));
        clients.put(servers, client);
        return client;
    } else {
        return clients.get(servers);
    }
}

From source file:com.lovelysystems.hive.udf.MemcachedUDF.java

License:Apache License

public IntWritable evaluate(Text servers, Text key, Text value) {
    if (key == null || servers == null) {
        result.set(-1);
        return result;
    }
    MemcachedClient client;
    try {
        client = getClient(servers);
    } catch (IOException e) {
        LOG.error("failed to get client: servers=" + servers + " key=" + key + " value=" + value, e);
        result.set(-1);
        return result;
    }
    Future<Boolean> f;
    if (value == null) {
        f = client.delete(key.toString());
        result.set(2);
    } else {

        f = client.set(key.toString(), 0, value.toString());
        result.set(1);
    }
    try {
        f.get();
    } catch (Exception e) {
        LOG.error("failed to set value: servers=" + servers + " key=" + key + " value=" + value, e);
        result.set(-2);
    }
    return result;
}

From source file:com.lovelysystems.hive.udf.UnescapeXMLUDF.java

License:Apache License

public Text evaluate(final Text s) {
    if (s == null) {
        return null;
    } else if (s.find("&") == -1) {
        res.set(s);
    } else {
        res.set(s.toString());
    }
    return res;
}
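
The branch on s.find("&") above highlights a useful distinction: Text.set(Text) copies the underlying UTF-8 bytes without decoding them, while toString() decodes the bytes into a new String on each call. The following sketch (class, method, and sample values are illustrative, not taken from the UDF above) shows the same pattern of reusing a single output Text and decoding only when the value needs to be transformed:

import org.apache.hadoop.io.Text;

public class ReuseTextSketch {
    // Reused output object, mirroring the 'res' field pattern common in Hive UDFs.
    private final Text res = new Text();

    public Text evaluate(Text input) {
        if (input.find("&") == -1) {
            // Fast path: copy the raw UTF-8 bytes, no decoding needed.
            res.set(input);
        } else {
            // Slow path: toString() decodes the bytes so the value can be transformed,
            // here by replacing a single literal entity (illustrative only).
            String decoded = input.toString();
            res.set(decoded.replace("&amp;", "&"));
        }
        return res;
    }

    public static void main(String[] args) {
        ReuseTextSketch sketch = new ReuseTextSketch();
        System.out.println(sketch.evaluate(new Text("plain value")));     // plain value
        System.out.println(sketch.evaluate(new Text("rock &amp; roll"))); // rock & roll
    }
}

Decoding only when necessary avoids allocating a new String per row on the common fast path.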

From source file:com.luca.filipponi.tweetAnalysis.SentimentClassifier.CustomTestNaiveBayesDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(addOption(DefaultOptionCreator.overwriteOption().create()));
    addOption("model", "m", "The path to the model built during training", true);
    addOption(
            buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
    addOption("labelIndex", "l", "The path to the location of the label index", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    boolean complementary = hasOption("testComplementary");
    boolean sequential = hasOption("runSequential");
    if (sequential) {
        FileSystem fs = FileSystem.get(getConf());
        NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
        AbstractNaiveBayesClassifier classifier;
        if (complementary) {
            classifier = new ComplementaryNaiveBayesClassifier(model);
        } else {
            classifier = new StandardNaiveBayesClassifier(model);
        }
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class,
                VectorWritable.class);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, getInputPath(), getConf());
        Text key = new Text();
        VectorWritable vw = new VectorWritable();
        while (reader.next(key, vw)) {
            writer.append(new Text(SLASH.split(key.toString())[1]),
                    new VectorWritable(classifier.classifyFull(vw.get())));
        }
        writer.close();
        reader.close();
    } else {
        boolean succeeded = runMapReduce(parsedArgs);
        if (!succeeded) {
            return -1;
        }
    }

    //load the labels
    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

    //loop over the results and create the confusion matrix
    SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>(
            getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
    analyzeResults(labelMap, dirIterable, analyzer);

    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
    return 0;
}

From source file:com.lyft.hive.serde.DynamoDbSerDe.java

License:Apache License

@Override
public Object deserialize(Writable blob) throws SerDeException {
    ArrayList<Object> row = buildRow();
    Text rowText = (Text) blob;
    Map<String, String> values = decomposeRow(rowText.toString());

    for (int c = 0; c < numColumns; c++) {
        try {
            String t = values.get(columnNames.get(c));
            TypeInfo typeInfo = columnTypes.get(c);

            // Convert the column to the correct type when needed and set in row obj
            PrimitiveTypeInfo pti = (PrimitiveTypeInfo) typeInfo;
            switch (pti.getPrimitiveCategory()) {
            case STRING:
                row.set(c, t);
                break;
            case BYTE:
                Byte b;
                b = Byte.valueOf(t);
                row.set(c, b);
                break;
            case SHORT:
                Short s;
                s = Short.valueOf(t);
                row.set(c, s);
                break;
            case INT:
                Integer i;
                i = Integer.valueOf(t);
                row.set(c, i);
                break;
            case LONG:
                Long l;
                l = Long.valueOf(t);
                row.set(c, l);
                break;
            case FLOAT:
                Float f;
                f = Float.valueOf(t);
                row.set(c, f);
                break;
            case DOUBLE:
                Double d;
                d = Double.valueOf(t);
                row.set(c, d);
                break;
            case BOOLEAN:
                Boolean bool;
                bool = Boolean.valueOf(t);
                row.set(c, bool);
                break;
            case TIMESTAMP:
                row.set(c, parseTimestamp(t));
                break;
            case DATE:
                Date date;
                date = Date.valueOf(t);
                row.set(c, date);
                break;
            case DECIMAL:
                HiveDecimal bd = HiveDecimal.create(t);
                row.set(c, bd);
                break;
            case CHAR:
                HiveChar hc = new HiveChar(t, ((CharTypeInfo) typeInfo).getLength());
                row.set(c, hc);
                break;
            case VARCHAR:
                HiveVarchar hv = new HiveVarchar(t, ((VarcharTypeInfo) typeInfo).getLength());
                row.set(c, hv);
                break;
            default:
                throw new SerDeException("Unsupported type " + typeInfo);
            }
        } catch (RuntimeException e) {
            row.set(c, null);
        }
    }
    return row;
}

From source file:com.m6d.filecrush.crush.Crush.java

License:Apache License

private void cloneOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list, which is
     * used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();

        Reader reader = new Reader(fs, path, fs.getConf());

        try {
            while (reader.next(srcFile, crushOut)) {
                if (!crushOut.equals(prevCrushOut)) {
                    swap(crushInput, prevCrushOut.toString());

                    prevCrushOut.set(crushOut);
                    crushInput = new LinkedList<Path>();
                }

                crushInput.add(new Path(srcFile.toString()));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }

        swap(crushInput, prevCrushOut.toString());
    }

    /*
     * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the data that was in
     * the input dir, the difference being there are fewer files in the output dir.
     */
    if (removableFiles.size() > 0) {
        String srcDirName = fs.makeQualified(srcDir).toUri().getPath();
        String destName = fs.makeQualified(dest).toUri().getPath();
        print(Verbosity.INFO, "\n\nMoving removed files to " + destName);
        for (String name : removableFiles) {
            Path srcPath = new Path(name);
            Path destPath = new Path(destName + name).getParent();

            print(Verbosity.INFO, "\n  Moving " + srcPath + " to " + destPath);
            rename(srcPath, destPath, null);
        }
    }
}

From source file:com.m6d.filecrush.crush.Crush.java

License:Apache License

/**
 * Moves the skipped files to the output directory. Called when operating in normal (non-clone) mode.
 */
private void moveOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    Text srcFile = new Text();
    Text crushOut = new Text();

    Set<String> crushOutputFiles = new HashSet<String>(nBuckets);

    for (FileStatus partFile : listStatus) {
        Path path = partFile.getPath();

        Reader reader = new Reader(fs, path, fs.getConf());

        try {
            while (reader.next(srcFile, crushOut)) {
                crushOutputFiles.add(new Path(crushOut.toString()).toUri().getPath());
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Trapped exception when closing " + path, e);
            }
        }
    }

    assert crushOutputFiles.size() == nBuckets;

    /*
     * The crush output files will appear in a subdirectory of the output directory. The subdirectory will be the full path of the
     * input directory that was crushed. E.g.
     *
     * Crush input:
     * /user/me/input/dir1/file1
     * /user/me/input/dir1/file2
     * /user/me/input/dir2/file3
     * /user/me/input/dir2/file4
     * /user/me/input/dir3/dir4/file5
     * /user/me/input/dir3/dir4/file6
     *
     * Crush output:
     * /user/me/output/user/me/input/dir1/crushed_file ...
     * /user/me/output/user/me/input/dir2/crushed_file ...
     * /user/me/output/user/me/input/dir3/dir4/crushed_file ...
     *
     * We need to collapse this down to:
     * /user/me/output/dir1/crushed_file ...
     * /user/me/output/dir2/crushed_file ...
     * /user/me/output/dir3/dir4/crushed_file ...
     */
    String srcDirName = fs.makeQualified(srcDir).toUri().getPath();
    String destName = fs.makeQualified(dest).toUri().getPath();
    String partToReplace = fs.makeQualified(outDir).toUri().getPath() + "/crush" + srcDirName;

    for (String crushOutputFile : crushOutputFiles) {
        Path srcPath = new Path(crushOutputFile);
        Path destPath = new Path(destName + crushOutputFile.substring(partToReplace.length())).getParent();

        print(Verbosity.INFO, "\n  Renaming " + srcPath + " to " + destPath);
        rename(srcPath, destPath, null);
    }

    /*
     * Don't forget to move the files that were not crushed to the output dir so that the output dir has all the data that was in
     * the input dir, the difference being there are fewer files in the output dir.
     */
    if (skippedFiles.size() > 0) {
        print(Verbosity.INFO, "\n\nMoving skipped files to " + destName);

        for (String name : skippedFiles) {
            Path srcPath = new Path(name);
            Path destPath = new Path(destName + name.substring(srcDirName.length())).getParent();

            print(Verbosity.INFO, "\n  Renaming " + srcPath + " to " + destPath);
            rename(srcPath, destPath, null);
        }
    }
}

From source file:com.m6d.filecrush.crush.CrushReducer.java

License:Apache License

@Override
public void reduce(Text bucketId, Iterator<Text> values, OutputCollector<Text, Text> collector,
        Reporter reporter) throws IOException {
    String bucket = bucketId.toString();

    String dirName = bucket.substring(0, bucket.lastIndexOf('-'));

    int idx = findMatcher(dirName);

    String outputFileName = calculateOutputFile(idx, dirName);

    /*
     * Don't need to separate the paths because the output file name is already absolute.
     */
    valueOut.set(outDirPath + outputFileName);

    LOG.info(format("Crushing bucket '%s' to file '%s'", bucket, outputFileName));

    /*
     * Strip the leading slash to make the path relative. The output format will relativize it to the task attempt work dir.
     */
    RecordWriter<Object, Object> sink = null;
    FileSinkOperator.RecordWriter parquetSink = null;
    Exception rootCause = null;

    Void voidKey = null;
    Object key = null;
    Object value = null;

    String schemaSignature = null;
    String columns = null;
    String columnsTypes = null;
    Properties jobProperties = new Properties();
    boolean firstFile = true;

    try {
        while (null == rootCause && values.hasNext()) {
            Text srcFile = values.next();
            Path inputPath = new Path(srcFile.toString());
            RecordReader<Object, Object> reader = createRecordReader(idx, inputPath, reporter);

            if (firstFile) {
                firstFile = false;

                key = reader.createKey();
                if (null == key)
                    key = NullWritable.get();
                value = reader.createValue();

                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    schemaSignature = getAvroFileSchemaString(job, inputPath);
                    job.set("avro.schema.literal", schemaSignature);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    MessageType schema = getParquetFileSchema(job, inputPath);
                    List<Type> fieldsFromSchema = schema.getFields();
                    for (Type field : fieldsFromSchema) {
                        if (field.getOriginalType() != null) {
                            if (StringUtils.equals(field.getOriginalType().toString(), "DECIMAL")) {
                                String primitiveType = field.asPrimitiveType().toString();
                                int loc = primitiveType.indexOf("DECIMAL");
                                int start = loc + 7;
                                int end = primitiveType.indexOf(")", loc) + 1;
                                String ps = primitiveType.substring(start, end);
                                if (!decimalTypesHashMap.containsKey(ps)) {
                                    decimalTypesHashMap.put(field.getName().toString(), ps);
                                }
                            }
                        }
                    }
                    schemaSignature = getParquetFileSchemaString(job, inputPath);
                    StringBuilder columnsSb = new StringBuilder();
                    StringBuilder columnsTypesSb = new StringBuilder();
                    boolean firstColumn = true;
                    for (ColumnDescriptor col : schema.getColumns()) {
                        if (firstColumn) {
                            firstColumn = false;
                        } else {
                            columnsSb.append(",");
                            columnsTypesSb.append(",");
                        }
                        columnsSb.append(col.getPath()[0]);
                        String typeName = col.getType().toString();
                        if ("INT96".equals(typeName))
                            typeName = "timestamp";
                        else if ("INT64".equals(typeName))
                            typeName = "bigint";
                        else if ("INT32".equals(typeName))
                            typeName = "int";
                        else if ("INT16".equals(typeName))
                            typeName = "smallint";
                        else if ("INT8".equals(typeName))
                            typeName = "tinyint";
                        else if ("BINARY".equals(typeName))
                            typeName = "string";
                        else if ("BOOLEAN".equals(typeName))
                            typeName = "boolean";
                        else if ("DOUBLE".equals(typeName))
                            typeName = "double";
                        else if ("FLOAT".equals(typeName))
                            typeName = "float";
                        else if (typeName.startsWith("FIXED_LEN_BYTE_ARRAY")) {
                            String column = col.toString();
                            int start = column.indexOf('[') + 1;
                            int end = column.indexOf(']');
                            String fieldName = column.substring(start, end);
                            String lookupVal = decimalTypesHashMap.get(fieldName);
                            LOG.info("final string: decimal" + lookupVal);
                            typeName = "decimal" + lookupVal;
                        }
                        columnsTypesSb.append(typeName);
                    }
                    columns = columnsSb.toString();
                    columnsTypes = columnsTypesSb.toString();
                    jobProperties.put(IOConstants.COLUMNS, columns);
                    jobProperties.put(IOConstants.COLUMNS_TYPES, columnsTypes);
                    parquetSerDe = new ParquetHiveSerDe();
                    parquetSerDe.initialize(job, jobProperties);
                } else {
                    schemaSignature = key.getClass().getName() + ":" + value.getClass().getName();
                }

                /*
                 * Set the key and value class in the conf, which the output format uses to get type information.
                 */
                job.setOutputKeyClass(key.getClass());
                job.setOutputValueClass(value.getClass());

                /*
                 * Output file name is absolute so we can just add it to the crush prefix.
                 */
                if (MapredParquetOutputFormat.class.isAssignableFrom(getOutputFormatClass(idx))) {
                    outputFormat = "parquet";
                    parquetSink = createParquetRecordWriter(idx, valueOut.toString(), jobProperties,
                            (Class<? extends org.apache.hadoop.io.Writable>) value.getClass(), reporter);
                } else {
                    outputFormat = getOutputFormatClass(idx).getName();
                    sink = createRecordWriter(idx, valueOut.toString());
                }
            } else { // next files

                /*
                 * Ensure schema signature is the same as the first file's
                 */
                String nextSchemaSignature = null;
                if (AvroContainerInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getAvroFileSchemaString(job, inputPath);
                } else if (MapredParquetInputFormat.class.isAssignableFrom(getInputFormatClass(idx))) {
                    nextSchemaSignature = getParquetFileSchemaString(job, inputPath);
                } else {
                    Object otherKey = reader.createKey();
                    if (otherKey == null)
                        otherKey = NullWritable.get();
                    nextSchemaSignature = otherKey.getClass().getName() + ":"
                            + reader.createValue().getClass().getName();
                }
                if (!schemaSignature.equals(nextSchemaSignature)) {
                    throw new IllegalArgumentException(
                            format("Heterogeneous schema detected in file %s: [%s] != [%s]", inputPath,
                                    nextSchemaSignature, schemaSignature));
                }
            }

            boolean ret;
            if ("parquet".equals(outputFormat))
                ret = reader.next(voidKey, value);
            else
                ret = reader.next(key, value);
            while (ret) {
                if ("text".equals(inputFormat))
                    sink.write(key, null);
                else if (sink != null)
                    sink.write(key, value);
                else {
                    ParquetHiveRecord parquetHiveRecord = new ParquetHiveRecord(value,
                            (StructObjectInspector) parquetSerDe.getObjectInspector());
                    parquetSink.write(parquetHiveRecord);
                }
                reporter.incrCounter(ReducerCounter.RECORDS_CRUSHED, 1);

                if ("parquet".equals(outputFormat))
                    ret = reader.next(voidKey, value);
                else
                    ret = reader.next(key, value);
            }
            /*
             * Output of the reducer is the source file => crushed file (in the final output dir, not the task attempt work dir).
             */
            collector.collect(srcFile, valueOut);
            reporter.incrCounter(ReducerCounter.FILES_CRUSHED, 1);

            recordNumber++;

            if (reportRecordNumber == recordNumber) {
                reportRecordNumber += reportRecordNumber;

                reporter.setStatus(format("Processed %,d files %s : %s", recordNumber, bucket, inputPath));
            }
        }
    } catch (Exception e) {
        rootCause = e;
    } finally {
        if (null != sink) {
            try {
                sink.close(reporter);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }
        if (null != parquetSink) {
            try {
                parquetSink.close(false);
            } catch (Exception e) {
                if (null == rootCause) {
                    rootCause = e;
                } else {
                    LOG.error("Swallowing exception on close of " + outputFileName, e);
                }
            }
        }

        /*
         * Let the exception bubble up with a minimum of wrapping.
         */
        if (null != rootCause) {
            if (rootCause instanceof RuntimeException) {
                throw (RuntimeException) rootCause;
            }

            if (rootCause instanceof IOException) {
                throw (IOException) rootCause;
            }

            throw new RuntimeException(rootCause);
        }
    }
}