Example usage for org.apache.hadoop.io.LongWritable.get()

List of usage examples for org.apache.hadoop.io LongWritable get

Introduction

On this page you can find example usages of org.apache.hadoop.io.LongWritable.get().

Prototype

public long get() 

Source Link

Document

Return the value of this LongWritable.

Usage

From source file:nl.gridline.zieook.statistics.SourcesByRecommenderReduce.java

License:Apache License

/**
 * Adds up all values received for one recommender/source key, wraps the total in a
 * {@code Put} built by {@code HBasePUTFactory.putStatisticsSources} and writes it out
 * under the same key.
 *
 * @param key     the recommender/source pair being reduced
 * @param values  the counts to add up for this key
 * @param context the reducer context the resulting put is written to
 */
@Override
protected void reduce(RecommenderSource key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    // The running total is an int: every long value is added and the sum narrowed back to int.
    int total = 0;
    for (LongWritable current : values) {
        total = (int) (total + current.get());
    }

    final Put statisticsPut = HBasePUTFactory.putStatisticsSources(key.getRecommender(), key.getSource(), date,
            total);

    final String message = "reduce <[" + key.getRecommender() + "," + key.getSource() + "]," + total + ">";
    LOG.info(message);
    context.write(key, statisticsPut);
}

From source file:nl.gridline.zieook.statistics.ViewsPerItemReduce.java

License:Apache License

/**
 * Adds up all values received for one recommender/item key, wraps the total in a
 * {@code Put} built by {@code HBasePUTFactory.putStatisticsItemViews} and writes it out
 * under the same key.
 *
 * @param key     the recommender/item pair being reduced
 * @param values  the counts to add up for this key
 * @param context the reducer context the resulting put is written to
 */
@Override
protected void reduce(RecommenderItem key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    // The running total is an int: every long value is added and the sum narrowed back to int.
    int total = 0;
    for (LongWritable current : values) {
        total = (int) (total + current.get());
    }

    final Put statisticsPut = HBasePUTFactory.putStatisticsItemViews(key.getRecommender(), key.getItem(), date,
            total);

    final String message = "reduce <[" + key.getRecommender() + "," + key.getItem() + "]," + total + ">";
    LOG.info(message);
    context.write(key, statisticsPut);
}

From source file:nl.surfsara.warcexamples.datascience.WordCountMapper.java

License:Apache License

/**
 * For every HTTP response record whose content type contains {@code text/html}, writes
 * one word-count entry and one entry per non-empty absolute link, all keyed by the
 * record's target URI. Records without HTTP header, payload or content are skipped.
 *
 * @param key     the record key; only used for the task status message
 * @param value   the WARC record to inspect
 * @param context the mapper context used for status, counters and output
 */
@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());

    // Only process http response content. Note that the outlinks can also be found in the wat metadata.
    if (!"application/http; msgtype=response".equals(value.header.contentTypeStr)) {
        return;
    }

    // org.jwat.warc.WarcRecord is kind enough to also parse http headers for us.
    HttpHeader httpHeader = value.getHttpHeader();
    if (httpHeader == null) {
        // No header so we are unsure that the content is text/html.
        return;
    }
    // Note that if you really want to do this right you should look at the character encoding as well.
    if (httpHeader.contentType == null || !httpHeader.contentType.contains("text/html")) {
        return;
    }
    context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1);

    // Get the html payload
    Payload payload = value.getPayload();
    if (payload == null) {
        return;
    }

    String warcContent = IOUtils.toString(payload.getInputStreamComplete());
    // Skip missing or empty content. This used "&&", which can never be true, so empty
    // documents were processed as well.
    if (warcContent == null || "".equals(warcContent)) {
        return;
    }

    // The HTML markup is deliberately left in place: the earlier attempt to strip it with
    // Jsoup.parse(warcContent).text() was disabled because it seemed to empty the pages.
    String targetURI = value.header.warcTargetUriStr;

    // Write word count mapping
    context.write(new Text(targetURI), new Text(parseToString(countWords(warcContent))));

    // Write links
    Document doc = Jsoup.parse(warcContent);
    Elements links = doc.select("a");
    for (Element link : links) {
        String absHref = link.attr("abs:href");
        // Omit nulls and empty strings
        if (absHref != null && !("".equals(absHref))) {
            context.write(new Text(targetURI), new Text(absHref));
        }
    }
}

From source file:nl.surfsara.warcexamples.hadoop.ldps.LDPRMapper.java

License:Apache License

/**
 * Runs language detection on {@code text/plain} records and writes the detected language
 * with a count of 1. A record is only considered when its target host differs from the
 * host remembered from the previously considered record, and at most while the
 * {@code temp} counter has not exceeded 50.
 *
 * @param key     the record key; only used for the task status message
 * @param value   the WARC record to inspect
 * @param context the mapper context used for status, counters and output
 */
@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());

    String targetURI = value.header.warcTargetUriStr;

    if (null == targetURI || "".equals(targetURI)) {
        context.getCounter(Counters.NUM_NO_IP).increment(1);
        // NOTE(review): warcTargetUriStrHost is left untouched here, so it keeps whatever an
        // earlier call stored - confirm that this is intended.
    } else {
        try {
            warcTargetUriStrHost = new URI(targetURI).getHost();
        } catch (URISyntaxException e) {
            logger.error(e);
        }
    }

    // Only process text/plain content whose host differs from the previously processed one.
    // NOTE(review): assumes prevWarcTargetUriStr is initialised to a non-null value - confirm.
    if (!"text/plain".equals(value.header.contentTypeStr)
            || prevWarcTargetUriStr.equals(warcTargetUriStrHost)) {
        return;
    }
    context.getCounter(Counters.NUM_TEXT_RECORDS).increment(1);
    // Get the text payload
    Payload payload = value.getPayload();

    prevWarcTargetUriStr = warcTargetUriStrHost;

    if (payload == null || temp > 50) {
        return;
    }

    String warcContent = IOUtils.toString(payload.getInputStreamComplete());
    // Only classify non-empty content. This used "&&", which can never be true, so empty
    // content was handed to the detector as well.
    if (warcContent != null && !"".equals(warcContent)) {
        int i = warcContent.indexOf(' ');
        if (i > 0) {
            // Log the first space-delimited token of the content through both loggers.
            String word = warcContent.substring(0, i);
            logger.info("LOGGER: tmpdir: " + word);
            LOG.info("LOG: tmpdir: " + word);
        }

        try {
            Detector detector = DetectorFactory.create();
            detector.append(warcContent);
            context.write(new Text(detector.detect()), new LongWritable(1));
        } catch (LangDetectException e) {
            // Detection failed for this record: log it and carry on with the next one.
            logger.error(e);
        }
    }
    // Counted for every record that reached the content stage, empty or not.
    temp++;
}

From source file:nl.surfsara.warcexamples.hadoop.rr.RRMapper.java

License:Apache License

@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());

    // Only process text/plain content
    if ("text/plain".equals(value.header.contentTypeStr)) {
        context.getCounter(Counters.NUM_TEXT_RECORDS).increment(1);
        // Get the text payload
        Payload payload = value.getPayload();

        if (payload == null) {
            // NOP
        } else {/*from  w  w  w .j  av  a2s .c  o m*/

            String warcContent = IOUtils.toString(payload.getInputStreamComplete());

            if (warcContent == null && "".equals(warcContent)) {
                // NOP
            } else {

                // warcContent is the text content of a site. Do something with it!
                results = new ArrayList<String>();

                Chunking chunking = dictionaryChunkerFT.chunk(warcContent);
                for (Chunk chunk : chunking.chunkSet()) {
                    int start = chunk.start();
                    int end = chunk.end();
                    String type = chunk.type();
                    String recording = warcContent.substring(start, end);
                    if (recording.length() > 1) {
                        String[] correspondingArtist = songMap.get(recording);
                        //                         
                        //                         System.out.println("Recording: "+recording);
                        //                         System.out.println(Arrays.toString(correspondingArtist));

                        if (null != correspondingArtist && correspondingArtist.length > 0) {
                            for (String artist : correspondingArtist) {

                                String contentSubstring = warcContent.substring(Math.max(0, start - 300), start)
                                        + warcContent.substring(end, Math.min(end + 300, warcContent.length()));

                                // 
                                // contentSubstring.toLowerCase().contains(artist.toLowerCase()
                                //       System.out.println(tmp.matches(".*\\bpro\\b.*"));
                                if (null != contentSubstring && null != artist) {
                                    if (contentSubstring.toLowerCase().contains(artist.toLowerCase())) {

                                        contentSubstring = contentSubstring.toLowerCase()
                                                .replaceAll("(\\r|\\n)", " ");

                                        if (contentSubstring
                                                .matches(".*\\b" + artist.toLowerCase() + "\\b.*")) {
                                            String resultKey = type + ";" + artist;

                                            context.getCounter(Counters.NUM_SONGS_DETECTED).increment(1);

                                            if (!results.contains(resultKey)) {
                                                //                                              System.out.println(resultKey);
                                                results.add(resultKey);
                                                context.write(new Text(value.header.warcRefersToStr),
                                                        new Text(resultKey));
                                            }
                                        } else {
                                            logger.info(artist + ", " + contentSubstring);
                                        }

                                    }

                                }
                            }
                        }
                    }
                }

            }
        }
    }
}

From source file:nl.surfsara.warcexamples.hadoop.warc.HrefExtracter.java

License:Apache License

@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());

    // Only process http response content. Note that the outlinks can also be found in the wat metadata.
    if ("application/http; msgtype=response".equals(value.header.contentTypeStr)) {
        // org.jwat.warc.WarcRecord is kind enough to also parse http headers for us:
        HttpHeader httpHeader = value.getHttpHeader();
        if (httpHeader == null) {
            // No header so we are unsure that the content is text/html: NOP
        } else {//from   w  w w  .ja  v  a2  s. co m
            if (httpHeader.contentType != null && httpHeader.contentType.contains("text/html")) {
                // Note that if you really want to do this right; you should look at the character encoding as well.
                // We'll leave that as an exercise for you ;-).
                context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1);
                // Get the html payload
                Payload payload = value.getPayload();
                if (payload == null) {
                    // NOP
                } else {
                    String warcContent = IOUtils.toString(payload.getInputStreamComplete());
                    if (warcContent == null || "".equals(warcContent)) {
                        // NOP
                    } else {
                        String targetURI = value.header.warcTargetUriStr;

                        Document doc = Jsoup.parse(warcContent);

                        Elements links = doc.select("a");
                        for (Element link : links) {
                            String absHref = link.attr("abs:href");
                            // Omit nulls and empty strings
                            if (absHref != null && !("".equals(absHref))) {
                                context.write(new Text(targetURI), new Text(absHref));
                            }
                        }
                    }
                }
            }
        }
    }
}

From source file:nl.surfsara.warcexamples.hadoop.wat.ServerTypeExtracter.java

License:Apache License

@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());

    // Only try to parse json content
    if ("application/json".equals(value.header.contentTypeStr)) {
        context.getCounter(Counters.NUM_JSON_RECORDS).increment(1);
        // Get the json payload
        Payload payload = value.getPayload();
        if (payload == null) {
            // NOP
        } else {//from   w  ww . j  a  v  a2 s.co m
            String warcContent = IOUtils.toString(payload.getInputStreamComplete());
            JSONObject json = new JSONObject(warcContent);
            try {
                String server = json.getJSONObject("Envelope").getJSONObject("Payload-Metadata")
                        .getJSONObject("HTTP-Response-Metadata").getJSONObject("Headers").getString("Server");
                context.write(new Text(server), new LongWritable(1));
            } catch (JSONException e) {
                // Not the JSON we were looking for.. 
                logger.error(e);
            }
        }
    }
}

From source file:nl.surfsara.warcexamples.hadoop.wet.NERMapper.java

License:Apache License

@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());
    // Only process text/plain content
    if ("text/plain".equals(value.header.contentTypeStr)) {
        context.getCounter(Counters.NUM_TEXT_RECORDS).increment(1);
        // Get the text payload
        Payload payload = value.getPayload();
        if (payload == null) {
            // NOP
        } else {//from  ww w.j  a  va2s .c  o m
            if (numrecords < MAX_RECORDS) {
                String warcContent = IOUtils.toString(payload.getInputStreamComplete());
                if (warcContent == null && "".equals(warcContent)) {
                    // NOP
                } else {
                    // Classify text      
                    List<List<CoreLabel>> classify = classifier.classify(warcContent);
                    for (List<CoreLabel> coreLabels : classify) {
                        for (CoreLabel coreLabel : coreLabels) {
                            String term = coreLabel.word();
                            String tag = coreLabel.get(CoreAnnotations.AnswerAnnotation.class);

                            if (!"O".equals(tag)) {

                                context.write(new Text(tag), new Text(term));
                            }
                        }
                    }
                    numrecords++;
                }
            }
        }
    }
}

From source file:nl.tudelft.graphalytics.giraph.algorithms.cdlp.CommonCommunityDetectionLPComputation.java

License:Apache License

/**
 * Picks the new label of a vertex: the label that occurs most often among the incoming
 * labels, with ties resolved in favour of the numerically smallest label. The chosen
 * label is stored in the vertex value.
 *
 * NOTE(review): when there are no incoming labels the vertex value is set to 0 - confirm
 * that callers never pass an empty iterable, or that 0 is the intended result.
 */
private void determineLabel(Vertex<LongWritable, LongWritable, E> vertex,
        Iterable<LongWritable> incomingLabels) {
    // Count how often each incoming label occurs; labels not seen yet read as 0.
    labelOccurences.clear();
    labelOccurences.defaultReturnValue(0L);
    for (LongWritable incoming : incomingLabels) {
        final long id = incoming.get();
        final long seenSoFar = labelOccurences.get(id);
        labelOccurences.put(id, seenSoFar + 1);
    }

    // Scan the counts for the winner: highest count first, lowest label id on a tie.
    long winner = 0;
    long winnerCount = 0;
    for (Long2LongMap.Entry entry : labelOccurences.long2LongEntrySet()) {
        final long candidate = entry.getLongKey();
        final long candidateCount = entry.getLongValue();

        final boolean occursMoreOften = candidateCount > winnerCount;
        final boolean tiedWithLowerId = candidateCount == winnerCount && candidate < winner;
        if (occursMoreOften || tiedWithLowerId) {
            winner = candidate;
            winnerCount = candidateCount;
        }
    }

    // Store the winning label in the vertex
    vertex.getValue().set(winner);
}

From source file:nl.tudelft.graphalytics.giraph.algorithms.cdlp.DirectedCommunityDetectionLPComputation.java

License:Apache License

/**
 * Initialisation step. In superstep 0 the vertex sends its own id along all of its edges.
 * In any later superstep the received ids are matched against the existing edges:
 * matching edges are marked {@code BIDIRECTIONAL}, every remaining id gets a new edge
 * with value {@code UNIDIRECTIONAL_EDGE}, and the vertex value is set to the vertex id.
 *
 * @param vertex   the vertex being initialised
 * @param messages the vertex ids received from other vertices
 */
@Override
protected void doInitialisationStep(Vertex<LongWritable, LongWritable, BooleanWritable> vertex,
        Iterable<LongWritable> messages) {
    if (getSuperstep() == 0) {
        // Announce this vertex to its outgoing neighbours so they learn about their incoming edges.
        sendMessageToAllEdges(vertex, vertex.getId());
        return;
    }

    // Collect the sender ids of all incoming messages
    LongSet unmatchedSenders = new LongOpenHashSet();
    for (LongWritable sender : messages) {
        unmatchedSenders.add(sender.get());
    }

    // An existing edge whose target also sent a message is bidirectional
    for (MutableEdge<LongWritable, BooleanWritable> outgoing : vertex.getMutableEdges()) {
        long target = outgoing.getTargetVertexId().get();
        if (!unmatchedSenders.contains(target)) {
            continue;
        }
        unmatchedSenders.remove(target);
        outgoing.getValue().set(BIDIRECTIONAL);
    }

    // Senders without a matching outgoing edge get a new unidirectional edge
    LongIterator remaining = unmatchedSenders.iterator();
    while (remaining.hasNext()) {
        long senderId = remaining.nextLong();
        vertex.addEdge(EdgeFactory.create(new LongWritable(senderId), UNIDIRECTIONAL_EDGE));
    }

    // The initial label of the vertex is its own id
    vertex.getValue().set(vertex.getId().get());
}