Example usage for org.apache.hadoop.fs FileSystem get

Introduction

This page collects example usages for org.apache.hadoop.fs.FileSystem.get.

Prototype

public static FileSystem get(Configuration conf) throws IOException 

Document

Returns the configured FileSystem implementation.
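
Below is a minimal, self-contained sketch of the call, assuming the standard core-site.xml/hdfs-site.xml are on the classpath; the class name and file path are illustrative, not taken from the examples that follow. Whether the call returns a local file system or an HDFS client depends on the configured default file system.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemGetExample {
    public static void main(String[] args) throws IOException {
        // fs.defaultFS in the loaded configuration determines whether this
        // returns the local file system or an HDFS client.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Illustrative path only.
        Path path = new Path("/tmp/filesystem-get-example.txt");
        FSDataOutputStream out = fs.create(path);
        out.writeUTF("hello");
        out.close();

        System.out.println("exists: " + fs.exists(path));
    }
}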

Usage

From source file:FormatStorageBasicTest.java

License:Open Source License

public void testGetRecordByOrderSegment() {
    Segment segment = null;
    try {
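        // Build a seven-field schema: byte, short, int, long, float, double, and string.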
        FieldMap fieldMap = new FieldMap();
        fieldMap.addField(new Field(ConstVar.FieldType_Byte, ConstVar.Sizeof_Byte, (short) 0));
        fieldMap.addField(new Field(ConstVar.FieldType_Short, ConstVar.Sizeof_Short, (short) 1));
        fieldMap.addField(new Field(ConstVar.FieldType_Int, ConstVar.Sizeof_Int, (short) 2));
        fieldMap.addField(new Field(ConstVar.FieldType_Long, ConstVar.Sizeof_Long, (short) 3));
        fieldMap.addField(new Field(ConstVar.FieldType_Float, ConstVar.Sizeof_Float, (short) 4));
        fieldMap.addField(new Field(ConstVar.FieldType_Double, ConstVar.Sizeof_Double, (short) 5));
        fieldMap.addField(new Field(ConstVar.FieldType_String, 0, (short) 6));

        Head head = new Head();
        head.setFieldMap(fieldMap);

        String fileName = prefix + "testGetRecordByOrderSegment";
        Path path = new Path(fileName);
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataOutputStream out = fs.create(path);

        Configuration conf = new Configuration();
        FormatDataFile fd = new FormatDataFile(conf);
        fd.setWorkStatus(ConstVar.WS_Write);
        fd.head = head;
        fd.setOut(out);

        IndexInfo info = new IndexInfo();
        info.offset = 0;

        segment = new Segment(info, fd);

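        // Write 150,000 fixed-format records into the segment.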
        int recordNum = 150000;
        for (int i = 0; i < recordNum; i++) {
            Record record = new Record(7);
            record.addValue(new FieldValue((byte) 1, (short) 0));
            record.addValue(new FieldValue((short) 2, (short) 1));
            record.addValue(new FieldValue((int) 3, (short) 2));
            record.addValue(new FieldValue((long) 4, (short) 3));
            record.addValue(new FieldValue((float) 5.5, (short) 4));
            record.addValue(new FieldValue((double) 6.6, (short) 5));
            record.addValue(new FieldValue("hello konten", (short) 6));

            segment.addRecord(record);
            record = null;
        }

        segment.persistent(out);
        out.close();

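        // Reopen the file and read the segment back for verification.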
        FSDataInputStream in = fs.open(path);

        fd.setIn(in);
        fd.setWorkStatus(ConstVar.WS_Read);

        info.offset = 0;
        info.len = segment.len();

        Segment segment2 = new Segment(info, fd);

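        // Lookups keyed on wrong field indices or values should return null;
        // the correct key (and a null key) should return all 150,000 records.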
        FieldValue[] values = new FieldValue[2];
        values[0] = new FieldValue((byte) 1, (short) 2);
        values[1] = new FieldValue((short) 2, (short) 3);
        Record[] records = segment2.getRecordByOrder(values, values.length);
        if (records != null) {
            fail("should get null, index error, records.len:" + records.length);
        }

        values[0] = new FieldValue((byte) 1, (short) 0);
        values[1] = new FieldValue((short) 3, (short) 1);
        records = segment2.getRecordByOrder(values, values.length);
        if (records != null) {
            fail("should get null, value error");
        }

        values[0] = new FieldValue((byte) 1, (short) 0);
        values[1] = new FieldValue((short) 2, (short) 1);
        records = segment2.getRecordByOrder(values, values.length);
        if (records == null) {
            fail("should not get null");
        }
        if (records.length != 150000) {
            fail("error result size:" + records.length);
        }

        for (int i = 0; i < 150000; i++) {
            judgeFixedRecord(records[i]);
        }

        records = segment2.getRecordByOrder(null, 0);
        if (records == null) {
            fail("should not get null");
        }
        if (records.length != 150000) {
            fail("error result size:" + records.length);
        }

        for (int i = 0; i < 150000; i++) {
            judgeFixedRecord(records[i]);
        }
    } catch (IOException e) {
        e.printStackTrace();
        fail("get IOException:" + e.getMessage());
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}

From source file:FormatStorageBasicTest.java

License:Open Source License

public void testSetNoPreFileName() {
    try {
        Configuration conf = new Configuration();
        FormatDataFile fd = new FormatDataFile(conf);
        Head head = new Head();

        String fileName = prefix + "testSetNoPreFileName";
        fd.create(fileName, head);

        Path path = new Path(fileName);
        FileSystem fs = FileSystem.get(new Configuration());
        if (!fs.isFile(path)) {
            fail("create file fail");
        }
    } catch (IOException e) {
        e.printStackTrace();
        fail("get ioexception:" + e.getMessage());
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }

}

From source file:FormatStorageBasicTest.java

License:Open Source License

public void testSetPreFileName() {
    try {
        Configuration conf = new Configuration();
        FormatDataFile fd = new FormatDataFile(conf);
        Head head = new Head();

        //String fileName = "hdfs://tdw-172-25-38-246:54310/user/tdwadmin/testSetPreFileName";
        String fileName = "/user/tdwadmin/se_test/fs/basic/testSetPreFileName";
        fd.create(fileName, head);
        fd.close();

        Path path = new Path(fileName);
        FileSystem fs = FileSystem.get(new Configuration());
        if (!fs.isFile(path)) {
            fail("create file fail");
        }
    } catch (IOException e) {
        fail(e.getMessage());
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}

From source file:FormatStorageBasicTest.java

License:Open Source License

public void testClose() {
    try {
        Head head = new Head();
        head.setVar((byte) 1);
        Configuration conf = new Configuration();
        FormatDataFile fd = new FormatDataFile(conf);
        fd.create(prefix + "testClose", head);

        int size = 100 * 10000;

        for (int i = 0; i < size; i++) {
            Record record = new Record(7);
            record.addValue(new FieldValue((byte) 1, (short) 0));
            record.addValue(new FieldValue((short) 2, (short) 1));
            record.addValue(new FieldValue((int) 3, (short) 2));
            record.addValue(new FieldValue((long) 4, (short) 3));
            record.addValue(new FieldValue((float) 5.5, (short) 4));
            record.addValue(new FieldValue((double) 6.6, (short) 5));
            record.addValue(new FieldValue("hello konten", (short) 6));
            fd.addRecord(record);
        }

        if (fd.recordNum() != size) {
            fail("error record num:" + fd.recordNum());
        }
        // check the segment itself before dereferencing it for the unit check
        if (fd.currentSegment() == null) {
            fail("null current seg");
        }
        if (fd.currentSegment().currentUnit() == null) {
            fail("null current unit");
        }
        if (fd.segmentNum() != 0) {
            fail("error segment num:" + fd.segmentNum());
        }

        int headLen = head.len();
        long currentUnitLen = fd.currentSegment().currentUnit().len();
        long segmentLen = fd.currentSegment().len() + currentUnitLen + ConstVar.LineIndexRecordLen;
        long remain = fd.currentSegment().remain();
        int unitNum = fd.currentSegment().unitNum();
        fd.close();

        int indexLen = ConstVar.LineIndexRecordLen * fd.segmentNum();
        int metaLen = ConstVar.IndexMetaOffset;

        long fileLen = fd.getFileLen();

        if (fileLen != headLen + segmentLen + indexLen + metaLen) {
            fail("error file len:" + fileLen);
        }

        if (fd.in() != null) {
            fail("in should set null");
        }
        if (fd.out() != null) {
            fail("out should set null");
        }
        if (fd.recordNum() != 0) {
            fail("record num should set 0");
        }
        if (fd.keyIndexOffset != -1) {
            fail("key index offset not -1");
        }
        if (fd.lineIndexOffset != -1) {
            fail("line index offset not -1");
        }
        if (fd.currentOffset != -1) {
            fail("current offset not -1");
        }
        if (fd.hasLoadAllSegmentDone) {
            fail("has load all segment Done not false");
        }

        String fileName = prefix + "testClose";
        Path path = new Path(fileName);
        FileSystem fs = FileSystem.get(new Configuration());
        FSDataInputStream in = fs.open(path);

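        // Read the index meta block at the file tail: record count, segment
        // count, and the key/line index offsets.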
        long metaOffset = fileLen - ConstVar.IndexMetaOffset;
        in.seek(metaOffset);

        int recordNum = in.readInt();
        int segNum = in.readInt();
        long keyIndexOffset = in.readLong();
        long lineIndexOffset = in.readLong();

        if (recordNum != size) {
            fail("error record num:" + recordNum);
        }
        if (segNum != 1) {
            fail("error segNum:" + segNum);
        }
        if (keyIndexOffset != -1) {
            fail("error key index offset:" + keyIndexOffset);
        }
        if (lineIndexOffset != (headLen + segmentLen)) {
            fail("error line index offset:" + lineIndexOffset);
        }

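        // Check each line-index record: line range, offset, and length.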
        in.seek(lineIndexOffset);
        for (int i = 0; i < segNum; i++) {
            int beginLine = in.readInt();
            int endLine = in.readInt();
            long offset = in.readLong();
            long len = in.readLong();
            int idx = in.readInt();

            if (beginLine != 0) {
                fail("error beginLine:" + beginLine);
            }
            if (endLine != size) {
                fail("error end line:" + endLine);
            }
            if (offset != head.len()) {
                fail("error offset:" + offset);
            }
            long tlen = size * full7chunkLen + size * 8 + ConstVar.DataChunkMetaOffset * (unitNum + 1)
                    + 28 * (unitNum + 1) + 24;
            if (len != tlen) {
                fail("error len:" + len);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        fail("get ioexception:" + e.getMessage());
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}

From source file:FormatStorageBasicTest.java

License:Open Source License

public void testNoFillLastSegment() {
    try {
        String fileName = prefix + "testNoFillLastSegment";
        Head head = new Head();

        FieldMap fieldMap = new FieldMap();
        fieldMap.addField(new Field(ConstVar.FieldType_Byte, ConstVar.Sizeof_Byte, (short) 0));
        fieldMap.addField(new Field(ConstVar.FieldType_Short, ConstVar.Sizeof_Short, (short) 1));
        fieldMap.addField(new Field(ConstVar.FieldType_Int, ConstVar.Sizeof_Int, (short) 2));
        fieldMap.addField(new Field(ConstVar.FieldType_Long, ConstVar.Sizeof_Long, (short) 3));
        fieldMap.addField(new Field(ConstVar.FieldType_Float, ConstVar.Sizeof_Float, (short) 4));
        fieldMap.addField(new Field(ConstVar.FieldType_Double, ConstVar.Sizeof_Double, (short) 5));
        fieldMap.addField(new Field(ConstVar.FieldType_String, 0, (short) 6));

        head.setFieldMap(fieldMap);

        Configuration conf = new Configuration();
        FormatDataFile fd = new FormatDataFile(conf);
        fd.create(fileName, head);

        Record record = new Record(7);
        record.addValue(new FieldValue((byte) 1, (short) 0));
        record.addValue(new FieldValue((short) 2, (short) 1));
        record.addValue(new FieldValue((int) 3, (short) 2));
        record.addValue(new FieldValue((long) 4, (short) 3));
        record.addValue(new FieldValue((float) 5.5, (short) 4));
        record.addValue(new FieldValue((double) 6.6, (short) 5));
        record.addValue(new FieldValue("hello konten", (short) 6));

        fd.addRecord(record);

        fd.close();

        FileSystem fs = FileSystem.get(conf);
        long fileLen = fs.getFileStatus(new Path(fileName)).getLen();

        int tlen = head.len() + full7chunkLen + 8 + ConstVar.DataChunkMetaOffset + ConstVar.LineIndexRecordLen
                + ConstVar.IndexMetaOffset + ConstVar.LineIndexRecordLen + ConstVar.IndexMetaOffset;
        if (fileLen != tlen) {
            fail("error file len:" + fileLen);
        }

        FormatDataFile fd2 = new FormatDataFile(new Configuration());
        fd2.open(fileName);
        if (fd2.recordNum() != 1) {
            fail("error record num:" + fd2.recordNum());
        }
        if (fd2.segmentNum() != 1) {
            fail("error segment num:" + fd2.segmentNum());
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}

From source file:FormatStorageBasicTest.java

License:Open Source License

public void testOpenNoRecord() {
    try {
        String fileName = prefix + "testOpenNoRecord";
        Head head = new Head();
        FormatDataFile fd = new FormatDataFile(new Configuration());
        fd.create(fileName, head);
        fd.close();

        FileSystem fs = FileSystem.get(new Configuration());
        long fileLen = fs.getFileStatus(new Path(fileName)).getLen();
        if (fileLen != head.len() + ConstVar.IndexMetaOffset) {
            fail("error file len:" + fileLen);
        }

        FormatDataFile fd2 = new FormatDataFile(new Configuration());
        fd2.open(fileName);
        if (fd2.recordNum() != 0) {
            fail("error record num:" + fd2.recordNum());
        }
        if (fd2.segmentNum() != 0) {
            fail("error segment num:" + fd2.segmentNum());
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}

From source file:WikipediaForwardIndexBuilder.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));

    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;

        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }

    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    // Clean up.
    fs.delete(new Path(tmpPath), true);

    return 0;
}

From source file:RunPageRankSchimmy.java

License:Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws Exception {
    Configuration conf = getConf();

    String in = path + "/iter" + FORMAT.format(i);
    String out = path + "/iter" + FORMAT.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // We need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log).
    int numPartitions = 0;
    for (FileStatus s : fs.listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-")) {
            numPartitions++;
        }
    }

    conf.setInt("NodeCount", n);

    Partitioner<IntWritable, Writable> p = null;

    if (useRange) {
        p = new RangePartitioner();
        ((Configurable) p).setConf(conf);
    } else {
        p = new HashPartitioner<IntWritable, Writable>();
    }

    // This is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (!f.getPath().getName().contains("part-")) {
            continue;
        }

        SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(f.getPath()));

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        LOG.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + ";");
    }

    LOG.info(sb.toString().trim());

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase1");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);
    LOG.info(" - nodeCnt: " + n);
    LOG.info(" - useCombiner: " + useCombiner);
    LOG.info(" - useInmapCombiner: " + useInmapCombiner);
    LOG.info(" - numPartitions: " + numPartitions);
    LOG.info(" - useRange: " + useRange);
    LOG.info("computed number of partitions: " + numPartitions);

    int numReduceTasks = numPartitions;

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    //conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
    job.setJarByClass(RunPageRankSchimmy.class);

    job.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        job.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        job.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        job.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    job.setReducerClass(ReduceClass.class);

    fs.delete(new Path(out), true);
    fs.delete(new Path(outm), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file:RunPageRankSchimmy.java

License:Apache License

private void phase2(String path, int i, int j, int n, float missing) throws Exception {
    Configuration conf = getConf();

    LOG.info("missing PageRank mass: " + missing);
    LOG.info("number of nodes: " + n);

    String in = path + "/iter" + FORMAT.format(j) + "t";
    String out = path + "/iter" + FORMAT.format(j);

    LOG.info("PageRankSchimmy: iteration " + j + ": Phase2");
    LOG.info(" - input: " + in);
    LOG.info(" - output: " + out);

    // These must be set before Job.getInstance copies the Configuration,
    // otherwise the job will not see them.
    conf.setFloat("MissingMass", (float) missing);
    conf.setInt("NodeCount", n);

    Job job = Job.getInstance(conf);
    job.setJobName("PageRankSchimmy:iteration" + j + ":Phase2");
    job.setJarByClass(RunPageRankSchimmy.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(in));
    FileOutputFormat.setOutputPath(job, new Path(out));

    job.setInputFormatClass(NonSplitableSequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MapPageRankMassDistributionClass.class);

    FileSystem.get(conf).delete(new Path(out), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
}

From source file:TaskSearchWords.java

public static void main(String[] args) throws Exception {

    String hadoopServer = "ip-172-31-13-245.ap-southeast-1.compute.internal";

    Configuration conf = new Configuration();

    // this should match the job tracker defined in your mapred-site.xml
    conf.set("mapred.job.tracker", hadoopServer + ":54311");

    // as defined in hdfs-site.xml
    conf.set("fs.default.name", "hdfs://" + hadoopServer + ":9000");

    // set the mapper and reducer classes so the framework knows which classes to run
    conf.set("mapreduce.map.class", "TokenizerMapper");
    conf.set("mapreduce.reduce.class", "IntSumReducer");

    // ship the job jar to the cluster to prevent ClassNotFoundException
    conf.set("mapred.jar", "C:\\GitRepos\\OCR\\HadoopTasks\\dist\\HadoopTasks.jar");

    // pass parameters to the map/reduce classes through the Configuration
    conf.set("RAWOCRCLOB",
            "Omeprazole_Cap E/C 10mg\n" + "Dressit Ster esDress\n" + "Flaminal Forte 15g\n"
                    + "Co-Magaldrox_Susp 195mg/220mg/5ml S/F\n" + "Antacid/Oxetacaine_Oral Susp S/F\n"
                    + "Simeticone_Susp 40mg/ml S/F\n" + "Infacol_Susp 40mg/ml S/F");

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(TaskSearchWords.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path("/user/ubuntu/MedicinesProcessed.csv"));
    FileSystem fs = FileSystem.get(conf);
    Path out = new Path("/user/ubuntu/processed/");
    fs.delete(out, true);

    // finally, set the now-empty output path
    FileOutputFormat.setOutputPath(job, out);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}