Example usage for org.apache.hadoop.fs FileUtil stat2Paths

List of usage examples for org.apache.hadoop.fs FileUtil stat2Paths

Introduction

This page lists example usages of org.apache.hadoop.fs.FileUtil.stat2Paths.

Prototype

public static Path[] stat2Paths(FileStatus[] stats) 

Document

Converts an array of FileStatus objects to an array of Path objects.
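
A minimal, self-contained sketch of the typical pattern: list a directory with FileSystem.listStatus and pass the resulting FileStatus[] to FileUtil.stat2Paths to get a Path[]. The class name and the directory path below are placeholders for illustration, not taken from the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // List a directory (the path here is a placeholder).
        Path dir = new Path("/tmp/example");
        FileStatus[] stats = fs.listStatus(dir);

        // Convert the FileStatus entries to Path objects.
        Path[] paths = FileUtil.stat2Paths(stats);
        for (Path p : paths) {
            System.out.println(p);
        }
    }
}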

Usage

From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java

License:Apache License

public void doTestTextBatchAppend(boolean useRawLocalFileSystem) throws Exception {
    LOG.debug("Starting...");

    final long rollCount = 10;
    final long batchSize = 2;
    final String fileName = "FlumeData";
    String newPath = testPath + "/singleTextBucket";
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(newPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    Context context = new Context();

    // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
    context.put("hdfs.path", newPath);
    context.put("hdfs.filePrefix", fileName);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.rollInterval", "0");
    context.put("hdfs.rollSize", "0");
    context.put("hdfs.batchSize", String.valueOf(batchSize));
    context.put("hdfs.writeFormat", "Text");
    context.put("hdfs.useRawLocalFileSystem", Boolean.toString(useRawLocalFileSystem));
    context.put("hdfs.fileType", "DataStream");

    Configurables.configure(sink, context);

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);

    sink.setChannel(channel);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    List<String> bodies = Lists.newArrayList();

    // push the event batches into channel to roll twice
    for (i = 1; i <= (rollCount * 10) / batchSize; i++) {
        Transaction txn = channel.getTransaction();
        txn.begin();
        for (j = 1; j <= batchSize; j++) {
            Event event = new SimpleEvent();
            eventDate.clear();
            eventDate.set(2011, i, i, i, 0); // yy mm dd
            String body = "Test." + i + "." + j;
            event.setBody(body.getBytes());
            bodies.add(body);
            channel.put(event);
            totalEvents++;
        }
        txn.commit();
        txn.close();

        // execute sink to process the events
        sink.process();
    }

    sink.stop();

    // loop through all the files generated and check their contents
    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    // check that the roll happened correctly for the given data
    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0)
        expectedFiles++;
    Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    // check the contents of the all files
    verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}

From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java

License:Apache License

@Test
public void testTextAppend()
        throws InterruptedException, LifecycleException, EventDeliveryException, IOException {

    LOG.debug("Starting...");
    final long rollCount = 3;
    final long batchSize = 2;
    final String fileName = "FlumeData";
    String newPath = testPath + "/singleTextBucket";
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(newPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    Context context = new Context();

    // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
    context.put("hdfs.path", newPath);
    context.put("hdfs.filePrefix", fileName);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.batchSize", String.valueOf(batchSize));
    context.put("hdfs.writeFormat", "Text");
    context.put("hdfs.fileType", "DataStream");

    Configurables.configure(sink, context);

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);

    sink.setChannel(channel);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    List<String> bodies = Lists.newArrayList();

    // push the event batches into channel
    for (i = 1; i < 4; i++) {
        Transaction txn = channel.getTransaction();
        txn.begin();
        for (j = 1; j <= batchSize; j++) {
            Event event = new SimpleEvent();
            eventDate.clear();
            eventDate.set(2011, i, i, i, 0); // yy mm dd
            event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
            event.getHeaders().put("hostname", "Host" + i);
            String body = "Test." + i + "." + j;
            event.setBody(body.getBytes());
            bodies.add(body);
            channel.put(event);
            totalEvents++;
        }
        txn.commit();
        txn.close();

        // execute sink to process the events
        sink.process();
    }

    sink.stop();

    // loop through all the files generated and check their contents
    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    // check that the roll happened correctly for the given data
    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0)
        expectedFiles++;
    Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}

From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java

License:Apache License

@Test
public void testAvroAppend()
        throws InterruptedException, LifecycleException, EventDeliveryException, IOException {

    LOG.debug("Starting...");
    final long rollCount = 3;
    final long batchSize = 2;
    final String fileName = "FlumeData";
    String newPath = testPath + "/singleTextBucket";
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(newPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    Context context = new Context();

    // context.put("hdfs.path", testPath + "/%Y-%m-%d/%H");
    context.put("hdfs.path", newPath);
    context.put("hdfs.filePrefix", fileName);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.batchSize", String.valueOf(batchSize));
    context.put("hdfs.writeFormat", "Text");
    context.put("hdfs.fileType", "DataStream");
    context.put("serializer", "AVRO_EVENT");

    Configurables.configure(sink, context);

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);

    sink.setChannel(channel);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    List<String> bodies = Lists.newArrayList();

    // push the event batches into channel
    for (i = 1; i < 4; i++) {
        Transaction txn = channel.getTransaction();
        txn.begin();
        for (j = 1; j <= batchSize; j++) {
            Event event = new SimpleEvent();
            eventDate.clear();
            eventDate.set(2011, i, i, i, 0); // yy mm dd
            event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
            event.getHeaders().put("hostname", "Host" + i);
            String body = "Test." + i + "." + j;
            event.setBody(body.getBytes());
            bodies.add(body);
            channel.put(event);
            totalEvents++;
        }
        txn.commit();
        txn.close();

        // execute sink to process the events
        sink.process();
    }

    sink.stop();

    // loop through all the files generated and check their contents
    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    // check that the roll happened correctly for the given data
    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0)
        expectedFiles++;
    Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    verifyOutputAvroFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}

From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java

License:Apache License

@Test
public void testSimpleAppend()
        throws InterruptedException, LifecycleException, EventDeliveryException, IOException {

    LOG.debug("Starting...");
    final String fileName = "FlumeData";
    final long rollCount = 5;
    final long batchSize = 2;
    final int numBatches = 4;
    String newPath = testPath + "/singleBucket";
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(newPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    Context context = new Context();

    context.put("hdfs.path", newPath);
    context.put("hdfs.filePrefix", fileName);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.batchSize", String.valueOf(batchSize));

    Configurables.configure(sink, context);

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);

    sink.setChannel(channel);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    List<String> bodies = Lists.newArrayList();

    // push the event batches into channel
    for (i = 1; i < numBatches; i++) {
        Transaction txn = channel.getTransaction();
        txn.begin();
        for (j = 1; j <= batchSize; j++) {
            Event event = new SimpleEvent();
            eventDate.clear();
            eventDate.set(2011, i, i, i, 0); // yy mm dd
            event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
            event.getHeaders().put("hostname", "Host" + i);
            String body = "Test." + i + "." + j;
            event.setBody(body.getBytes());
            bodies.add(body);
            channel.put(event);
            totalEvents++;
        }
        txn.commit();
        txn.close();

        // execute sink to process the events
        sink.process();
    }

    sink.stop();

    // loop through all the files generated and check their contents
    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    // check that the roll happened correctly for the given data
    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0)
        expectedFiles++;
    Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}

From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java

License:Apache License

@Test
public void testSimpleAppendLocalTime()
        throws InterruptedException, LifecycleException, EventDeliveryException, IOException {
    final long currentTime = System.currentTimeMillis();
    Clock clk = new Clock() {
        @Override
        public long currentTimeMillis() {
            return currentTime;
        }
    };

    LOG.debug("Starting...");
    final String fileName = "FlumeData";
    final long rollCount = 5;
    final long batchSize = 2;
    final int numBatches = 4;
    String newPath = testPath + "/singleBucket/%s";
    String expectedPath = testPath + "/singleBucket/" + String.valueOf(currentTime / 1000);
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(expectedPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    Context context = new Context();

    context.put("hdfs.path", newPath);
    context.put("hdfs.filePrefix", fileName);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.batchSize", String.valueOf(batchSize));
    context.put("hdfs.useLocalTimeStamp", String.valueOf(true));

    Configurables.configure(sink, context);

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);

    sink.setChannel(channel);
    sink.setBucketClock(clk);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    List<String> bodies = Lists.newArrayList();

    // push the event batches into channel
    for (i = 1; i < numBatches; i++) {
        Transaction txn = channel.getTransaction();
        txn.begin();
        for (j = 1; j <= batchSize; j++) {
            Event event = new SimpleEvent();
            eventDate.clear();
            eventDate.set(2011, i, i, i, 0); // yy mm dd
            event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
            event.getHeaders().put("hostname", "Host" + i);
            String body = "Test." + i + "." + j;
            event.setBody(body.getBytes());
            bodies.add(body);
            channel.put(event);
            totalEvents++;
        }
        txn.commit();
        txn.close();

        // execute sink to process the events
        sink.process();
    }

    sink.stop();

    // loop through all the files generated and check their contents
    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    // check that the roll happened correctly for the given data
    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0)
        expectedFiles++;
    Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
    // The clock in bucketpath is static, so restore the real clock
    sink.setBucketClock(new SystemClock());
}

From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java

License:Apache License

private void slowAppendTestHelper(long appendTimeout)
        throws InterruptedException, IOException, LifecycleException, EventDeliveryException {
    final String fileName = "FlumeData";
    final long rollCount = 5;
    final long batchSize = 2;
    final int numBatches = 2;
    String newPath = testPath + "/singleBucket";
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(newPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    // create HDFS sink with slow writer
    HDFSTestWriterFactory badWriterFactory = new HDFSTestWriterFactory();
    sink = new HDFSEventSink(badWriterFactory);

    Context context = new Context();
    context.put("hdfs.path", newPath);
    context.put("hdfs.filePrefix", fileName);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.batchSize", String.valueOf(batchSize));
    context.put("hdfs.fileType", HDFSTestWriterFactory.TestSequenceFileType);
    context.put("hdfs.appendTimeout", String.valueOf(appendTimeout));
    Configurables.configure(sink, context);

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);

    sink.setChannel(channel);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    List<String> bodies = Lists.newArrayList();
    // push the event batches into channel
    for (i = 0; i < numBatches; i++) {
        Transaction txn = channel.getTransaction();
        txn.begin();
        for (j = 1; j <= batchSize; j++) {
            Event event = new SimpleEvent();
            eventDate.clear();
            eventDate.set(2011, i, i, i, 0); // yy mm dd
            event.getHeaders().put("timestamp", String.valueOf(eventDate.getTimeInMillis()));
            event.getHeaders().put("hostname", "Host" + i);
            event.getHeaders().put("slow", "1500");
            String body = "Test." + i + "." + j;
            event.setBody(body.getBytes());
            bodies.add(body);
            channel.put(event);
            totalEvents++;
        }
        txn.commit();
        txn.close();

        // execute sink to process the events
        sink.process();
    }

    sink.stop();

    // loop through all the files generated and check their contents
    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    // check that the roll happened correctly for the given data
    // Note that we'll end up with two files with only a head
    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0)
        expectedFiles++;
    Assert.assertEquals("num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    verifyOutputSequenceFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
}

From source file:org.apache.flume.sink.customhdfs.TestHDFSEventSink.java

License:Apache License

@Test
public void testCloseOnIdle() throws IOException, EventDeliveryException, InterruptedException {
    String hdfsPath = testPath + "/idleClose";

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(hdfsPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);
    Context context = new Context();
    context.put("hdfs.path", hdfsPath);
    /*
     * All three rolling methods are disabled so the only
     * way a file can roll is through the idle timeout.
     */
    context.put("hdfs.rollCount", "0");
    context.put("hdfs.rollSize", "0");
    context.put("hdfs.rollInterval", "0");
    context.put("hdfs.batchSize", "2");
    context.put("hdfs.idleTimeout", "1");
    Configurables.configure(sink, context);

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);

    sink.setChannel(channel);
    sink.start();

    Transaction txn = channel.getTransaction();
    txn.begin();
    for (int i = 0; i < 10; i++) {
        Event event = new SimpleEvent();
        event.setBody(("test event " + i).getBytes());
        channel.put(event);
    }
    txn.commit();
    txn.close();

    sink.process();
    sink.process();
    Thread.sleep(1001);
    // previous file should have timed out now
    // this can throw BucketClosedException(from the bucketWriter having
    // closed),this is not an issue as the sink will retry and get a fresh
    // bucketWriter so long as the onClose handler properly removes
    // bucket writers that were closed.
    sink.process();
    sink.process();
    Thread.sleep(500); // shouldn't be enough for a timeout to occur
    sink.process();
    sink.process();
    sink.stop();
    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path[] fList = FileUtil.stat2Paths(dirStat);
    Assert.assertEquals("Incorrect content of the directory " + StringUtils.join(fList, ","), 2, fList.length);
    Assert.assertTrue(!fList[0].getName().endsWith(".tmp") && !fList[1].getName().endsWith(".tmp"));
    fs.close();
}

From source file:org.apache.giraph.aggregators.VarianceAggregatorDynamic.java

License:Apache License

@Override
public void aggregate(AggregateMessageCustome value) {

    //int megaStepInd = value.getMegaSlotInd();
    //if(value.getTweetList().get(0).getId() == -1)
    //   return;
    //if(neighborList == null)
    //   neighborList = new ArrayList<int[]> ();
    //if(simList == null)
    //   variance = new ArrayList<float> ();
    //if(tweets == null)
    //   tweets = new ArrayList<Tweet>();
    LinkedList<Tweet> tweetList = value.getTweetList();
    LinkedList<Tweet> newList = new LinkedList<Tweet>();

    if (tweetList.size() == 1) { // worker aggregate
        AggregateMessageCustome prev = getAggregatedValue();
        LinkedList<Tweet> newList1 = prev.getTweetList();
        Tweet currTweet = tweetList.get(0);
        boolean checkNext = false;
        //LinkedList<Tweet> missedTweets = new LinkedList<Tweet>();
        for (Tweet tweet : newList1) {
            if (newList.size() == factor * topicNumber) {
                break;
            }
            if (!checkNext) {
                if (tweet.getVariance() >= currTweet.getVariance()) {
                    if (tweet.getConflictList().contains(currTweet.getId())) {
                        newList = newList1;
                        break;
                    }
                    newList.add(tweet);
                } else { // should add the currTweet
                    newList.add(currTweet);
                    checkNext = true;
                    if (newList.size() == factor * topicNumber) {
                        break;
                    }
                    if (!currTweet.getConflictList().contains(tweet.getId())) {
                        newList.add(tweet);
                    } // else {
                      //   missedTweets.add(tweet);
                      //}
                }
            } else {
                if (!currTweet.getConflictList().contains(tweet.getId())) {
                    newList.add(tweet);
                } //else {
                  //missedTweets.add(tweet);
                  //}
            }
        }
        if ((newList.size() < factor * topicNumber) && !checkNext) {
            newList.add(currTweet);
        }
        //Iterator<Tweet> missedIter = missedTweets.iterator();
        //while (newList.size() < factor * topicNumber) {
        //   if(!missedIter.hasNext())
        //      break;
        //   newList.add(missedIter.next());
        //}
    }
    if (tweetList.size() > 1) { // master aggregate
        AggregateMessageCustome prev = getAggregatedValue();
        newList = prev.getTweetList();
        for (Tweet tweet : tweetList) {
            newList.add(tweet);
        }
        stage++;
        System.out.println("stage value = " + stage);
    }

    getAggregatedValue().setTweetList(newList);
    //getAggregatedValue().setMegaSlotInd(megaStepInd);

    if (tweetList.size() > 1 && (stage % workerNum == 0) && (stage != 0)) {
        try {
            //System.out.println("About to write in the aggregator");
            //Path pt = new Path("/user/exp/ahmed/20k_10k/output/timeslot-0");
            //int numMegaStep = (stage/workerNum) - 1;

            FileSystem fs = FileSystem.get(new Configuration());
            FileStatus[] fileStatus = fs.listStatus(new Path("/user/exp/ahmed/50k_sliding_20k_output"));
            Path[] paths = FileUtil.stat2Paths(fileStatus);
            int numMegaStep = paths.length;
            //System.out.println("Current numMegaStep = "+numMegaStep);
            String outFileName = "/user/exp/ahmed/50k_sliding_20k_output/timeslot-" + numMegaStep + ".txt";
            Path pt = new Path(outFileName);
            System.out.println("Writing output to " + outFileName + ". pt = " + pt);
            BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
            Collections.sort(newList);
            /*
            String varFileName = "/user/exp/ahmed/20k_sliding_10k_output/var-"+ numMegaStep + ".txt";
            Path varpt = new Path(varFileName);
            System.out.println("Writing output to " + varFileName + ". pt = " + varpt);
            BufferedWriter varbr = new BufferedWriter(new OutputStreamWriter(fs.create(varpt,true)));
                    
            for(Tweet tweet : newList){ 
               varbr.write(tweet.getId()+"\t"+tweet.getVariance());
               varbr.newLine();
            }
            varbr.close();
            */
            HashSet<Integer> selectedTweets = new HashSet<Integer>();
            int lastTopic = -1;
            int topicCount = 0;
            Iterator<Tweet> tweetItr = newList.iterator();
            Tweet nextTweet = tweetItr.next();

            Loop: while (topicCount < topicNumber) {
                while ((nextTweet.getConflictList().contains(lastTopic)) && (lastTopic != -1)) {
                    if (!tweetItr.hasNext())
                        break Loop;
                    nextTweet = tweetItr.next();
                }
                lastTopic = nextTweet.getId();
                selectedTweets.add(lastTopic);
                if (!tweetItr.hasNext())
                    break;
                nextTweet = tweetItr.next();
                topicCount++;
            }
            // get tweet text
            String fileName = "/user/exp/ahmed/50k_sliding_20k_dynamic_tweets/tweets-" + numMegaStep;
            Path ptRead = new Path(fileName);
            FSDataInputStream reader = fs.open(ptRead);
            System.out.println("In agg. Getting tweet text from " + fileName + ". reader = " + reader);
            String line = null;
            //System.out.println("Reader created " + reader + ", content = " + reader.readLine());
            while ((line = reader.readLine()) != null) {
                String[] splits = line.split("\t");
                int id = Integer.parseInt(splits[0]);
                //System.out.println("About to write");
                if (selectedTweets.contains(id) && splits.length >= 2) {
                    //System.out.println(splits[1]);
                    br.write(splits[1]);
                    br.newLine();
                }
            }
            reader.close();
            br.close();
        } catch (IOException ex) {
            System.out.println("In aggregator: " + ex.getMessage());
        }
    }

    //vertexes.add(vertexId);
    //neighborList.add(neighbor);
    //variance.add(value.getVar());
    //System.out.println(tweets.size());

}

From source file:org.apache.hama.bsp.CombineFileInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(BSPJob bspJob, int numSplits) throws IOException {

    Configuration job = bspJob.getConfiguration();

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;

    // the values specified by setxxxSplitSize() takes precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = job.getLong("mapred.max.split.size", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(listStatus(bspJob));
    List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
    if (paths.length == 0) {
        return splits.toArray(new CombineFileSplit[splits.size()]);
    }

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        // pick one input path. If it matches all the filters in a pool,
        // add it to the output set
        for (int i = 0; i < paths.length; i++) {
            if (paths[i] == null) { // already processed
                continue;
            }
            Path p = new Path(paths[i].toUri().getPath());
            if (onepool.accept(p)) {
                myPaths.add(paths[i]); // add it to my output set
                paths[i] = null; // already processed
            }
        }
        // create splits for all files in this pool.
        getMoreSplits(bspJob, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    // Finally, process all paths that do not belong to any pool.
    ArrayList<Path> myPaths = new ArrayList<Path>();
    for (Path path : paths) {
        if (path == null) { // already processed
            continue;
        }
        myPaths.add(path);
    }
    // create splits for all files that are not in any pool.
    getMoreSplits(bspJob, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits.toArray(new CombineFileSplit[splits.size()]);
}

From source file:org.apache.mahout.classifier.sequencelearning.baumwelchmapreduce.BaumWelchUtils.java

License:Apache License

public static HmmModel CreateHmmModel(int nrOfHiddenStates, int nrOfOutputStates, Path modelPath,
        Configuration conf) throws IOException {

    log.info("Entering Create Hmm Model. Model Path = {}", modelPath.toUri());
    Vector initialProbabilities = new DenseVector(nrOfHiddenStates);
    Matrix transitionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfHiddenStates);
    Matrix emissionMatrix = new DenseMatrix(nrOfHiddenStates, nrOfOutputStates);

    // Get the path location where the seq files encoding model are stored
    Path modelFilesPath = new Path(modelPath, "*");
    log.info("Create Hmm Model. ModelFiles Path = {}", modelFilesPath.toUri());
    Collection<Path> result = new ArrayList<Path>();

    // get all filtered file names in result list
    FileSystem fs = modelFilesPath.getFileSystem(conf);
    log.info("Create Hmm Model. File System = {}", fs);
    FileStatus[] matches = fs.listStatus(
            FileUtil.stat2Paths(fs.globStatus(modelFilesPath, PathFilters.partFilter())),
            PathFilters.partFilter());

    for (FileStatus match : matches) {
        log.info("CreateHmmmModel Adding File Match {}", match.getPath().toString());
        result.add(fs.makeQualified(match.getPath()));
    }

    // iterate through the result path list
    for (Path path : result) {
        for (Pair<Writable, MapWritable> pair : new SequenceFileIterable<Writable, MapWritable>(path, true,
                conf)) {
            Text key = (Text) pair.getFirst();
            log.info("CreateHmmModel Matching Seq File Key = {}", key);
            MapWritable valueMap = pair.getSecond();
            if (key.charAt(0) == 'I') {
                // initial distribution stripe
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Initial Prob Adding  Key, Value  = ({} {})",
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get());
                    initialProbabilities.set(((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'T') {
                // transition distribution stripe
                // key is of the form TRANSIT_0, TRANSIT_1 etc
                // the number after _ is the state ID at char number 11
                int stateID = Character.getNumericValue(key.charAt(8));
                log.info("CreateHmmModel stateID = key.charAt(8) = {}", stateID);
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Transition Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    transitionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else if (key.charAt(0) == 'E') {
                // emission distribution stripe
                // key is of the form EMIT_0, EMIT_1 etc
                // the number after _ is the state ID at char number 5
                int stateID = Character.getNumericValue(key.charAt(5));
                for (MapWritable.Entry<Writable, Writable> entry : valueMap.entrySet()) {
                    log.info("CreateHmmModel Emission Matrix ({}, {}) = {}", new Object[] { stateID,
                            ((IntWritable) entry.getKey()).get(), ((DoubleWritable) entry.getValue()).get() });
                    emissionMatrix.set(stateID, ((IntWritable) entry.getKey()).get(),
                            ((DoubleWritable) entry.getValue()).get());
                }
            } else {
                throw new IllegalStateException("Error creating HmmModel from Sequence File Path");
            }
        }
    }
    HmmModel model = new HmmModel(transitionMatrix, emissionMatrix, initialProbabilities);
    HmmUtils.validate(model);
    return model;
}