com.lightboxtechnologies.spectrum.SequenceFileExport.java Source code

Introduction

Here is the source code for com.lightboxtechnologies.spectrum.SequenceFileExport.java. SequenceFileExport is a map-only Hadoop MapReduce job that reads FsEntry records for a disk image from HBase and exports the entries whose file extension matches a configured set into a block-compressed SequenceFile, keyed by the raw row-key bytes (BytesWritable) with a MapWritable of path, extension, hash, and content fields as the value.

Source

/*
Copyright 2011, Lightbox Technologies, Inc
    
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    
http://www.apache.org/licenses/LICENSE-2.0
    
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package com.lightboxtechnologies.spectrum;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.util.*;
import org.apache.hadoop.hbase.client.Scan;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.codec.binary.Hex;

import org.sleuthkit.hadoop.SKJobFactory;

public class SequenceFileExport {

    private static final Log LOG = LogFactory.getLog(SequenceFileExport.class);

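    /*
     * Map-only mapper: for each FsEntry whose file extension is in the
     * configured "extensions" set, emits the raw row-key bytes as a
     * BytesWritable key and a MapWritable value holding the entry's full
     * path, extension, SHA-1 and MD5 hex digests, and either the content
     * bytes ("data") or the HDFS path of the content ("hdfs_path").
     */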
    protected static class SequenceFileExportMapper
            extends Mapper<ImmutableHexWritable, FsEntry, BytesWritable, MapWritable> {

        private final Set<String> Extensions = new HashSet<String>();

        private final BytesWritable OutKey = new BytesWritable();
        private final MapWritable Fields = new MapWritable();
        private final Text FullPath = new Text();
        private final Text Ext = new Text();
        private final Text Sha = new Text();
        private final Text Md5 = new Text();
        // FIXME: IBW instead?
        private final BytesWritable Vid = new BytesWritable();
        private final Text HdfsPath = new Text();

        public SequenceFileExportMapper() {
            Fields.put(new Text("full_path"), FullPath);
            Fields.put(new Text("extension"), Ext);
            Fields.put(new Text("sha1"), Sha);
            Fields.put(new Text("md5"), Md5);
            Fields.put(new Text("data"), Vid);
            Fields.put(new Text("hdfs_path"), HdfsPath);
        }

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            super.setup(context);

            final Configuration conf = context.getConfiguration();

            // get permissible file extensions from the configuration 
            Extensions.clear();
            Extensions.addAll(conf.getStringCollection("extensions"));
        }

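        // Hex-encodes a byte[] hash field of the entry into val; if the field
        // is missing or not a byte[], logs a warning and sets val to "".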
        void encodeHex(Text val, FsEntry entry, String field) {
            Object o = entry.get(field);
            if (o instanceof byte[]) { // instanceof is false for null, so no separate null check is needed
                byte[] b = (byte[]) o;
                val.set(new String(Hex.encodeHex(b)));
            } else {
                LOG.warn(entry.fullPath() + " didn't have a hash for " + field);
                val.set("");
            }
        }

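        // Entries with a non-matching extension are dropped; entries whose
        // content buffer is unavailable are skipped with a warning. Content
        // already stored in HDFS is passed by reference via "hdfs_path"
        // rather than copied into "data".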
        @Override
        public void map(ImmutableHexWritable key, FsEntry value, Context context)
                throws IOException, InterruptedException {
            if (Extensions.contains(value.extension())) {
                FullPath.set(value.fullPath());
                Ext.set(value.extension());

                encodeHex(Sha, value, "sha1");
                encodeHex(Md5, value, "md5");

                if (value.isContentHDFS()) {
                    Vid.setSize(0);
                    HdfsPath.set(value.getContentHdfsPath());
                } else {
                    final byte[] buf = value.getContentBuffer();
                    if (buf == null) {
                        LOG.warn(value.fullPath() + " didn't have a content buffer, skipping.");
                        return;
                    }
                    Vid.set(buf, 0, buf.length);
                    HdfsPath.set("");
                }
                byte[] keybytes = key.get();
                OutKey.set(keybytes, 0, keybytes.length);
                context.write(OutKey, Fields);
            }
        }
    }

    protected static void die() {
        System.err.println("Usage: SequenceFileExport <image_id> <friendlyname> <outpath> <ext> [<ext>]...\n"
                + "       SequenceFileExport -f <ext_file> <image_id> <friendlyname> <outpath>");
        System.exit(2);
    }

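    /*
     * Two invocation forms (see die()): file extensions given as trailing
     * arguments, or read one-per-line from a file when -f is passed.
     * Extensions are lowercased and handed to the mappers through the job
     * configuration.
     */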
    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();

        final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        String imageID;
        String outpath;
        String friendlyname;
        final Set<String> exts = new HashSet<String>();

        if ("-f".equals(otherArgs[0])) {
            if (otherArgs.length != 5) {
                die();
            }

            // load extensions from file
            final Path extpath = new Path(otherArgs[1]);

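            // Streams are closed explicitly so that close() failures surface,
            // with closeQuietly() in the finally blocks as a backstop.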
            InputStream in = null;
            try {
                in = extpath.getFileSystem(conf).open(extpath);

                Reader r = null;
                try {
                    r = new InputStreamReader(in);

                    BufferedReader br = null;
                    try {
                        br = new BufferedReader(r);

                        String line;
                        while ((line = br.readLine()) != null) {
                            exts.add(line.trim().toLowerCase());
                        }

                        br.close();
                    } finally {
                        IOUtils.closeQuietly(br);
                    }

                    r.close();
                } finally {
                    IOUtils.closeQuietly(r);
                }

                in.close();
            } finally {
                IOUtils.closeQuietly(in);
            }

            imageID = otherArgs[2];
            friendlyname = otherArgs[3];
            outpath = otherArgs[4];
        } else {
            if (otherArgs.length < 4) {
                die();
            }

            // read extensions from trailing args
            imageID = otherArgs[0];
            friendlyname = otherArgs[1];
            outpath = otherArgs[2];

            // lowercase all file extensions
            for (int i = 3; i < otherArgs.length; ++i) {
                exts.add(otherArgs[i].toLowerCase());
            }
        }

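        // Make the extension set available to the mappers (read back in setup()).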
        conf.setStrings("extensions", exts.toArray(new String[exts.size()]));

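        // Map-only job: FsEntry rows for the image are read from HBase and
        // written as BytesWritable/MapWritable pairs to a block-compressed
        // SequenceFile under outpath.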
        final Job job = SKJobFactory.createJobFromConf(imageID, friendlyname, "SequenceFileExport", conf);
        job.setJarByClass(SequenceFileExport.class);
        job.setMapperClass(SequenceFileExportMapper.class);
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(MapWritable.class);

        job.setInputFormatClass(FsEntryHBaseInputFormat.class);
        FsEntryHBaseInputFormat.setupJob(job, imageID);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        FileOutputFormat.setOutputPath(job, new Path(outpath));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
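
Usage

The command line follows the usage string printed by die(). A typical run (the jar name and paths below are illustrative, not taken from the project) would look like:

hadoop jar spectrum.jar com.lightboxtechnologies.spectrum.SequenceFileExport <image_id> <friendlyname> /path/to/outdir jpg png pdf

or, with the extension list stored one-per-line in an HDFS file:

hadoop jar spectrum.jar com.lightboxtechnologies.spectrum.SequenceFileExport -f /path/to/extensions.txt <image_id> <friendlyname> /path/to/outdir

The output directory then contains block-compressed SequenceFiles of BytesWritable keys and MapWritable values. The following is a minimal sketch of reading one of those files back; it assumes the Hadoop 1.x-style SequenceFile.Reader constructor and uses the field names set up in SequenceFileExportMapper (the class name SequenceFileExportReader is just for illustration).

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import org.apache.commons.codec.binary.Hex;

public class SequenceFileExportReader {
    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final Path part = new Path(args[0]); // a part file under the job's output directory

        final FileSystem fs = part.getFileSystem(conf);
        final BytesWritable key = new BytesWritable();
        final MapWritable fields = new MapWritable();

        final SequenceFile.Reader reader = new SequenceFile.Reader(fs, part, conf);
        try {
            while (reader.next(key, fields)) {
                // The key holds the raw row-key bytes written by the mapper; the
                // value maps field names (full_path, md5, sha1, ...) to their values.
                final byte[] raw = Arrays.copyOf(key.getBytes(), key.getLength());
                final Text fullPath = (Text) fields.get(new Text("full_path"));
                final Text md5 = (Text) fields.get(new Text("md5"));
                System.out.println(new String(Hex.encodeHex(raw)) + "\t" + md5 + "\t" + fullPath);
            }
        } finally {
            reader.close();
        }
    }
}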