List of usage examples for org.apache.hadoop.fs.FileUtil.stat2Paths
public static Path[] stat2Paths(FileStatus[] stats)
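stat2Paths extracts the Path from each FileStatus, so the common pattern in the examples below is a FileSystem.listStatus (or globStatus) call followed by FileUtil.stat2Paths. A minimal, self-contained sketch of that pattern; the class name and directory path are placeholders, not taken from the examples below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Path dir = new Path("/tmp/example");          // hypothetical directory
    FileStatus[] statuses = fs.listStatus(dir);   // one FileStatus per child entry
    Path[] paths = FileUtil.stat2Paths(statuses); // keep only the Path of each entry

    for (Path p : paths) {
      System.out.println(p);
    }
  }
}

Each example below applies this same conversion before asserting on, filtering, or iterating over the resulting paths.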
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
public void doTestTextBatchAppend(boolean useRawLocalFileSystem) throws Exception {
  LOG.debug("Starting...");
  final long rollCount = 10;
  final long batchSize = 2;
  final String fileName = "FlumeData";
  String newPath = testPath + "/singleTextBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.rollInterval", "0");
  context.put("hdfs.rollSize", "0");
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.writeFormat", "Text");
  context.put("hdfs.useRawLocalFileSystem", Boolean.toString(useRawLocalFileSystem));
  context.put("hdfs.fileType", "DataStream");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel to roll twice
  for (i = 1; i <= (rollCount * 10) / batchSize; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);

  // check the contents of all the files
  verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testTextAppend()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  LOG.debug("Starting...");
  final long rollCount = 3;
  final long batchSize = 2;
  final String fileName = "FlumeData";
  String newPath = testPath + "/singleTextBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.writeFormat", "Text");
  context.put("hdfs.fileType", "DataStream");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < 4; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testAvroAppend()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  LOG.debug("Starting...");
  final long rollCount = 3;
  final long batchSize = 2;
  final String fileName = "FlumeData";
  String newPath = testPath + "/singleTextBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.writeFormat", "Text");
  context.put("hdfs.fileType", "DataStream");
  context.put("serializer", "AVRO_EVENT");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < 4; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputAvroFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testSimpleAppend()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  LOG.debug("Starting...");
  final String fileName = "FlumeData";
  final long rollCount = 5;
  final long batchSize = 2;
  final int numBatches = 4;
  String newPath = testPath + "/singleBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < numBatches; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testSimpleAppendLocalTime()
    throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
  final long currentTime = System.currentTimeMillis();
  Clock clk = new Clock() {
    @Override
    public long currentTimeMillis() {
      return currentTime;
    }
  };

  LOG.debug("Starting...");
  final String fileName = "FlumeData";
  final long rollCount = 5;
  final long batchSize = 2;
  final int numBatches = 4;
  String newPath = testPath + "/singleBucket/%s";
  String expectedPath = testPath + "/singleBucket/" + String.valueOf(currentTime / 1000);
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(expectedPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.useLocalTimeStamp", String.valueOf(true));
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.setBucketClock(clk);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 1; i < numBatches; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);

  // The clock in bucketpath is static, so restore the real clock
  sink.setBucketClock(new SystemClock());
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
private void slowAppendTestHelper(long appendTimeout)
    throws InterruptedException, IOException, LifecycleException, EventDeliveryException {
  final String fileName = "FlumeData";
  final long rollCount = 5;
  final long batchSize = 2;
  final int numBatches = 2;
  String newPath = testPath + "/singleBucket";
  int totalEvents = 0;
  int i = 1, j = 1;

  // clear the test directory
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(newPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  // create HDFS sink with slow writer
  HDFSTestWriterFactory badWriterFactory = new HDFSTestWriterFactory();
  sink = new HDFSEventSink(badWriterFactory);

  Context context = new Context();
  context.put("hdfs.path", newPath);
  context.put("hdfs.filePrefix", fileName);
  context.put("hdfs.rollCount", String.valueOf(rollCount));
  context.put("hdfs.batchSize", String.valueOf(batchSize));
  context.put("hdfs.fileType", HDFSTestWriterFactory.TestSequenceFileType);
  context.put("hdfs.appendTimeout", String.valueOf(appendTimeout));
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Calendar eventDate = Calendar.getInstance();
  List<String> bodies = Lists.newArrayList();

  // push the event batches into channel
  for (i = 0; i < numBatches; i++) {
    Transaction txn = channel.getTransaction();
    txn.begin();
    for (j = 1; j <= batchSize; j++) {
      Event event = new SimpleEvent();
      eventDate.clear();
      eventDate.set(2011, i, i, i, 0); // yy mm dd
      event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
      event.getHeaders().put("hostname", "Host" + i);
      event.getHeaders().put("slow", "1500");
      String body = "Test." + i + "." + j;
      event.setBody(body.getBytes());
      bodies.add(body);
      channel.put(event);
      totalEvents++;
    }
    txn.commit();
    txn.close();

    // execute sink to process the events
    sink.process();
  }
  sink.stop();

  // loop through all the files generated and check their contents
  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path fList[] = FileUtil.stat2Paths(dirStat);

  // check that the roll happened correctly for the given data
  // Note that we'll end up with two files with only a head
  long expectedFiles = totalEvents / rollCount;
  if (totalEvents % rollCount > 0)
    expectedFiles++;
  Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
  verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}
From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java
License:Apache License
@Test
public void testCloseOnIdle() throws IOException, EventDeliveryException, InterruptedException {
  String hdfsPath = testPath + "/idleClose";
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path dirPath = new Path(hdfsPath);
  fs.delete(dirPath, true);
  fs.mkdirs(dirPath);

  Context context = new Context();
  context.put("hdfs.path", hdfsPath);
  /*
   * All three rolling methods are disabled so the only
   * way a file can roll is through the idle timeout.
   */
  context.put("hdfs.rollCount", "0");
  context.put("hdfs.rollSize", "0");
  context.put("hdfs.rollInterval", "0");
  context.put("hdfs.batchSize", "2");
  context.put("hdfs.idleTimeout", "1");
  Configurables.configure(sink, context);

  Channel channel = new MemoryChannel();
  Configurables.configure(channel, context);
  sink.setChannel(channel);
  sink.start();

  Transaction txn = channel.getTransaction();
  txn.begin();
  for (int i = 0; i < 10; i++) {
    Event event = new SimpleEvent();
    event.setBody(("test event " + i).getBytes());
    channel.put(event);
  }
  txn.commit();
  txn.close();

  sink.process();
  sink.process();
  Thread.sleep(1001); // previous file should have timed out now

  // this can throw BucketClosedException (from the bucketWriter having
  // closed), this is not an issue as the sink will retry and get a fresh
  // bucketWriter so long as the onClose handler properly removes
  // bucket writers that were closed.
  sink.process();
  sink.process();
  Thread.sleep(500); // shouldn't be enough for a timeout to occur
  sink.process();
  sink.process();
  sink.stop();

  FileStatus[] dirStat = fs.listStatus(dirPath);
  Path[] fList = FileUtil.stat2Paths(dirStat);
  Assert.assertEquals("Incorrect content of the directory " + StringUtils.join(fList, ","), 2, fList.length);
  Assert.assertTrue(!fList[0].getName().endsWith(".tmp") && !fList[1].getName().endsWith(".tmp"));
  fs.close();
}
From source file:org.apache.giraph.aggregators.VarianceAggregatorDynamic.java
License:Apache License
@Override
public void aggregate(AggregateMessageCustome value) {
  //int megaStepInd = value.getMegaSlotInd();
  //if(value.getTweetList().get(0).getId() == -1)
  //  return;
  //if(neighborList == null)
  //  neighborList = new ArrayList<int[]> ();
  //if(simList == null)
  //  variance = new ArrayList<float> ();
  //if(tweets == null)
  //  tweets = new ArrayList<Tweet>();
  LinkedList<Tweet> tweetList = value.getTweetList();
  LinkedList<Tweet> newList = new LinkedList<Tweet>();

  if (tweetList.size() == 1) { // worker aggregate
    AggregateMessageCustome prev = getAggregatedValue();
    LinkedList<Tweet> newList1 = prev.getTweetList();
    Tweet currTweet = tweetList.get(0);
    boolean checkNext = false;
    //LinkedList<Tweet> missedTweets = new LinkedList<Tweet>();
    for (Tweet tweet : newList1) {
      if (newList.size() == factor * topicNumber) {
        break;
      }
      if (!checkNext) {
        if (tweet.getVariance() >= currTweet.getVariance()) {
          if (tweet.getConflictList().contains(currTweet.getId())) {
            newList = newList1;
            break;
          }
          newList.add(tweet);
        } else {
          // should add the currTweet
          newList.add(currTweet);
          checkNext = true;
          if (newList.size() == factor * topicNumber) {
            break;
          }
          if (!currTweet.getConflictList().contains(tweet.getId())) {
            newList.add(tweet);
          }
          //else {
          //  missedTweets.add(tweet);
          //}
        }
      } else {
        if (!currTweet.getConflictList().contains(tweet.getId())) {
          newList.add(tweet);
        }
        //else {
        //  missedTweets.add(tweet);
        //}
      }
    }
    if ((newList.size() < factor * topicNumber) && !checkNext) {
      newList.add(currTweet);
    }
    //Iterator<Tweet> missedIter = missedTweets.iterator();
    //while (newList.size() < factor * topicNumber) {
    //  if(!missedIter.hasNext())
    //    break;
    //  newList.add(missedIter.next());
    //}
  }

  if (tweetList.size() > 1) { // master aggregate
    AggregateMessageCustome prev = getAggregatedValue();
    newList = prev.getTweetList();
    for (Tweet tweet : tweetList) {
      newList.add(tweet);
    }
    stage++;
    System.out.println("stage value = " + stage);
  }
  getAggregatedValue().setTweetList(newList);
  //getAggregatedValue().setMegaSlotInd(megaStepInd);

  if (tweetList.size() > 1 && (stage % workerNum == 0) && (stage != 0)) {
    try {
      //System.out.println("About to write in the aggregator");
      //Path pt = new Path("/user/exp/ahmed/20k_10k/output/timeslot-0");
      //int numMegaStep = (stage/workerNum) - 1;
      FileSystem fs = FileSystem.get(new Configuration());
      FileStatus[] fileStatus = fs.listStatus(new Path("/user/exp/ahmed/50k_sliding_20k_output"));
      Path[] paths = FileUtil.stat2Paths(fileStatus);
      int numMegaStep = paths.length;
      //System.out.println("Current numMegaStep = "+numMegaStep);
      String outFileName = "/user/exp/ahmed/50k_sliding_20k_output/timeslot-" + numMegaStep + ".txt";
      Path pt = new Path(outFileName);
      System.out.println("Writing output to " + outFileName + ". pt = " + pt);
      BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
      Collections.sort(newList);
      /*
      String varFileName = "/user/exp/ahmed/20k_sliding_10k_output/var-" + numMegaStep + ".txt";
      Path varpt = new Path(varFileName);
      System.out.println("Writing output to " + varFileName + ". pt = " + varpt);
      BufferedWriter varbr = new BufferedWriter(new OutputStreamWriter(fs.create(varpt, true)));
      for (Tweet tweet : newList) {
        varbr.write(tweet.getId() + "\t" + tweet.getVariance());
        varbr.newLine();
      }
      varbr.close();
      */
      HashSet<Integer> selectedTweets = new HashSet<Integer>();
      int lastTopic = -1;
      int topicCount = 0;
      Iterator<Tweet> tweetItr = newList.iterator();
      Tweet nextTweet = tweetItr.next();
      Loop: while (topicCount < topicNumber) {
        while ((nextTweet.getConflictList().contains(lastTopic)) && (lastTopic != -1)) {
          if (!tweetItr.hasNext())
            break Loop;
          nextTweet = tweetItr.next();
        }
        lastTopic = nextTweet.getId();
        selectedTweets.add(lastTopic);
        if (!tweetItr.hasNext())
          break;
        nextTweet = tweetItr.next();
        topicCount++;
      }

      // get tweet text
      String fileName = "/user/exp/ahmed/50k_sliding_20k_dynamic_tweets/tweets-" + numMegaStep;
      Path ptRead = new Path(fileName);
      FSDataInputStream reader = fs.open(ptRead);
      System.out.println("In agg. Getting tweet text from " + fileName + ". reader = " + reader);
      String line = null;
      //System.out.println("Reader created " + reader + ", content = " + reader.readLine());
      while ((line = reader.readLine()) != null) {
        String[] splits = line.split("\t");
        int id = Integer.parseInt(splits[0]);
        //System.out.println("About to write");
        if (selectedTweets.contains(id) && splits.length >= 2) {
          //System.out.println(splits[1]);
          br.write(splits[1]);
          br.newLine();
        }
      }
      reader.close();
      br.close();
    } catch (IOException ex) {
      System.out.println("In aggregator: " + ex.getMessage());
    }
  }
  //vertexes.add(vertexId);
  //neighborList.add(neighbor);
  //variance.add(value.getVar());
  //System.out.println(tweets.size());
}
From source file:org.apache.hama.bsp.CombineFileInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(BSPJob bspJob, int numSplits) throws IOException {
  Configuration job = bspJob.getConfiguration();
  long minSizeNode = 0;
  long minSizeRack = 0;
  long maxSize = 0;

  // the values specified by setxxxSplitSize() take precedence over the
  // values that might have been specified in the config
  if (minSplitSizeNode != 0) {
    minSizeNode = minSplitSizeNode;
  } else {
    minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
  }
  if (minSplitSizeRack != 0) {
    minSizeRack = minSplitSizeRack;
  } else {
    minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
  }
  if (maxSplitSize != 0) {
    maxSize = maxSplitSize;
  } else {
    maxSize = job.getLong("mapred.max.split.size", 0);
  }
  if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
    throw new IOException("Minimum split size pernode " + minSizeNode
        + " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
    throw new IOException("Minimum split size per rack" + minSizeRack
        + " cannot be larger than maximum split size " + maxSize);
  }
  if (minSizeRack != 0 && minSizeNode > minSizeRack) {
    throw new IOException("Minimum split size per node" + minSizeNode
        + " cannot be smaller than minimum split size per rack " + minSizeRack);
  }

  // all the files in input set
  Path[] paths = FileUtil.stat2Paths(listStatus(bspJob));
  List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
  if (paths.length == 0) {
    return splits.toArray(new CombineFileSplit[splits.size()]);
  }

  // In one single iteration, process all the paths in a single pool.
  // Processing one pool at a time ensures that a split contains paths
  // from a single pool only.
  for (MultiPathFilter onepool : pools) {
    ArrayList<Path> myPaths = new ArrayList<Path>();

    // pick one input path. If it matches all the filters in a pool,
    // add it to the output set
    for (int i = 0; i < paths.length; i++) {
      if (paths[i] == null) { // already processed
        continue;
      }
      Path p = new Path(paths[i].toUri().getPath());
      if (onepool.accept(p)) {
        myPaths.add(paths[i]); // add it to my output set
        paths[i] = null; // already processed
      }
    }
    // create splits for all files in this pool.
    getMoreSplits(bspJob, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
  }

  // Finally, process all paths that do not belong to any pool.
  ArrayList<Path> myPaths = new ArrayList<Path>();
  for (Path path : paths) {
    if (path == null) { // already processed
      continue;
    }
    myPaths.add(path);
  }
  // create splits for all files that are not in any pool.
  getMoreSplits(bspJob, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

  // free up rackToNodes map
  rackToNodes.clear();
  return splits.toArray(new CombineFileSplit[splits.size()]);
}
From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java
License:Apache License
public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
    Configuration conf) throws IOException {
  log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
  Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
  Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
  Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

  // Get the path location where the seq files encoding model are stored
  Path modelFilesPath = new Path(modelPath, "*");
  log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());

  Collection<Path> result = new ArrayList<Path>();

  // get all filtered file names in result list
  FileSystem fs = modelFilesPath.getFileSystem(conf);
  log.info("Create Hmm Model. File System = {}", fs);
  FileStatus[] matches = fs.listStatus(
      FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
      PathFilters.partFilter());
  for (FileStatus match : matches) {
    log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
    result.add(fs.makeQualified(match.getPath()));
  }

  // iterate through the result path list
  for (Path path : result) {
    for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true, conf)) {
      Text key = (Text) pair.getFirst();
      log.info("CreateHmmModel Matching Seq File Key = {}", key);
      MapWritable valueMap = pair.getSecond();
      if (key.charAt(0) == 'I') {
        // initial distribution stripe
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Initial Prob Adding Key, Value = ({} {})",
              ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
          initialProbabilities.set(((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'T') {
        // transition distribution stripe
        // key is of the form TRANSIT_0, TRANSIT_1 etc.;
        // the state ID is the digit after the underscore (character index 8)
        int stateID = Character.getNumericValue(key.charAt(8));
        log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Transition Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else if (key.charAt(0) == 'E') {
        // emission distribution stripe
        // key is of the form EMIT_0, EMIT_1 etc.;
        // the state ID is the digit after the underscore (character index 5)
        int stateID = Character.getNumericValue(key.charAt(5));
        for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
          log.info("CreateHmmModel Emission Matrix ({}, {}) = {}",
              new Object[] { stateID, ((IntWritable) entry.getKey()).get(),
                  ((DoubleWritable) entry.getValue()).get() });
          emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
              ((DoubleWritable) entry.getValue()).get());
        }
      } else {
        throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
      }
    }
  }
  HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
  HmmUtils.validate(model);
  return model;
}