List of usage examples for org.apache.hadoop.io LongWritable get
public long get()
From source file:nl.gridline.zieook.statistics.SourcesByRecommenderReduce.java
License:Apache License
/**
 * Adds up all counts received for one recommender/source pair, builds a {@code Put} for the
 * total via {@code HBasePUTFactory.putStatisticsSources} and writes it to the context.
 *
 * @param key     the recommender/source pair being reduced
 * @param values  the partial counts collected for {@code key}
 * @param context the context the key and the resulting put are written to
 */
@Override
protected void reduce(RecommenderSource key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    int total = 0;
    for (LongWritable partial : values) {
        // NOTE(review): the long value is narrowed into an int accumulator, so very large
        // totals wrap around silently; confirm whether the put factory could take a long.
        total = (int) (total + partial.get());
    }

    Put put = HBasePUTFactory.putStatisticsSources(key.getRecommender(), key.getSource(), date, total);

    String summary = "reduce <[" + key.getRecommender() + "," + key.getSource() + "]," + total + ">";
    LOG.info(summary);

    context.write(key, put);
}
From source file:nl.gridline.zieook.statistics.ViewsPerItemReduce.java
License:Apache License
@Override protected void reduce(RecommenderItem key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { int count = 0; for (LongWritable value : values) { count += value.get(); }//from w w w .j a v a2s . c o m Put put = HBasePUTFactory.putStatisticsItemViews(key.getRecommender(), key.getItem(), date, count); LOG.info("reduce <[" + key.getRecommender() + "," + key.getItem() + "]," + count + ">"); context.write(key, put); }
From source file:nl.surfsara.warcexamples.datascience.WordCountMapper.java
License:Apache License
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); // Only process http response content. Note that the outlinks can also be found in the wat metadata. if ("application/http; msgtype=response".equals(value.header.contentTypeStr)) { // org.jwat.warc.WarcRecord is kind enough to also parse http headers for us: HttpHeader httpHeader = value.getHttpHeader(); if (httpHeader == null) { // No header so we are unsure that the content is text/html: NOP } else {/* w w w . ja v a 2 s . c o m*/ if (httpHeader.contentType != null && httpHeader.contentType.contains("text/html")) { // Note that if you really want to do this right; you should look at the character encoding as well. // We'll leave that as an exercise for you ;-). context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1); // Get the html payload Payload payload = value.getPayload(); if (payload == null) { // NOP } else { String warcContent = IOUtils.toString(payload.getInputStreamComplete()); if (warcContent == null && "".equals(warcContent)) { // NOP } else { try { //Remove all HTML from warcContent, right now it seems to empty the pages completely. Please do test for yourself //warcContent = Jsoup.parse(warcContent).text(); } catch (Exception e) { } String targetURI = value.header.warcTargetUriStr; //Write Word Count Mapping context.write(new Text(targetURI), new Text(parseToString(countWords(warcContent)))); //Write Links Document doc = Jsoup.parse(warcContent); Elements links = doc.select("a"); for (Element link : links) { String absHref = link.attr("abs:href"); // Omit nulls and empty strings if (absHref != null && !("".equals(absHref))) { context.write(new Text(targetURI), new Text(absHref)); } } } } } } } }
From source file:nl.surfsara.warcexamples.hadoop.ldps.LDPRMapper.java
License:Apache License
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); String targetURI = value.header.warcTargetUriStr; if (null == targetURI || "".equals(targetURI)) { context.getCounter(Counters.NUM_NO_IP).increment(1); } else {//from w ww. ja va 2 s .c o m try { warcTargetUriStrHost = new URI(targetURI).getHost(); } catch (URISyntaxException e) { logger.error(e); } } // Only process text/plain content if ("text/plain".equals(value.header.contentTypeStr) && !(prevWarcTargetUriStr.equals(warcTargetUriStrHost))) { context.getCounter(Counters.NUM_TEXT_RECORDS).increment(1); // Get the text payload Payload payload = value.getPayload(); prevWarcTargetUriStr = warcTargetUriStrHost; if (payload == null || temp > 50) { // NOP } else { String warcContent = IOUtils.toString(payload.getInputStreamComplete()); if (warcContent == null && "".equals(warcContent)) { // NOP } else { // Classify text // warcContent = warcContent.substring(0, Math.min(warcContent.length(), MAXLEN)); int i = warcContent.indexOf(' '); if (i > 0) { String word = warcContent.substring(0, i); // context.write(new Text(word), new LongWritable(3)); logger.info("LOGGER: tmpdir: " + word); LOG.info("LOG: tmpdir: " + word); } try { Detector detector = DetectorFactory.create(); detector.append(warcContent); context.write(new Text(detector.detect()), new LongWritable(1)); } catch (LangDetectException e) { // TODO Auto-generated catch block e.printStackTrace(); logger.error(e); } } temp++; } } }
From source file:nl.surfsara.warcexamples.hadoop.rr.RRMapper.java
License:Apache License
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); // Only process text/plain content if ("text/plain".equals(value.header.contentTypeStr)) { context.getCounter(Counters.NUM_TEXT_RECORDS).increment(1); // Get the text payload Payload payload = value.getPayload(); if (payload == null) { // NOP } else {/*from w w w .j av a2s .c o m*/ String warcContent = IOUtils.toString(payload.getInputStreamComplete()); if (warcContent == null && "".equals(warcContent)) { // NOP } else { // warcContent is the text content of a site. Do something with it! results = new ArrayList<String>(); Chunking chunking = dictionaryChunkerFT.chunk(warcContent); for (Chunk chunk : chunking.chunkSet()) { int start = chunk.start(); int end = chunk.end(); String type = chunk.type(); String recording = warcContent.substring(start, end); if (recording.length() > 1) { String[] correspondingArtist = songMap.get(recording); // // System.out.println("Recording: "+recording); // System.out.println(Arrays.toString(correspondingArtist)); if (null != correspondingArtist && correspondingArtist.length > 0) { for (String artist : correspondingArtist) { String contentSubstring = warcContent.substring(Math.max(0, start - 300), start) + warcContent.substring(end, Math.min(end + 300, warcContent.length())); // // contentSubstring.toLowerCase().contains(artist.toLowerCase() // System.out.println(tmp.matches(".*\\bpro\\b.*")); if (null != contentSubstring && null != artist) { if (contentSubstring.toLowerCase().contains(artist.toLowerCase())) { contentSubstring = contentSubstring.toLowerCase() .replaceAll("(\\r|\\n)", " "); if (contentSubstring .matches(".*\\b" + artist.toLowerCase() + "\\b.*")) { String resultKey = type + ";" + artist; context.getCounter(Counters.NUM_SONGS_DETECTED).increment(1); if (!results.contains(resultKey)) { // System.out.println(resultKey); 
results.add(resultKey); context.write(new Text(value.header.warcRefersToStr), new Text(resultKey)); } } else { logger.info(artist + ", " + contentSubstring); } } } } } } } } } } }
From source file:nl.surfsara.warcexamples.hadoop.warc.HrefExtracter.java
License:Apache License
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); // Only process http response content. Note that the outlinks can also be found in the wat metadata. if ("application/http; msgtype=response".equals(value.header.contentTypeStr)) { // org.jwat.warc.WarcRecord is kind enough to also parse http headers for us: HttpHeader httpHeader = value.getHttpHeader(); if (httpHeader == null) { // No header so we are unsure that the content is text/html: NOP } else {//from w w w .ja v a2 s. co m if (httpHeader.contentType != null && httpHeader.contentType.contains("text/html")) { // Note that if you really want to do this right; you should look at the character encoding as well. // We'll leave that as an exercise for you ;-). context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1); // Get the html payload Payload payload = value.getPayload(); if (payload == null) { // NOP } else { String warcContent = IOUtils.toString(payload.getInputStreamComplete()); if (warcContent == null || "".equals(warcContent)) { // NOP } else { String targetURI = value.header.warcTargetUriStr; Document doc = Jsoup.parse(warcContent); Elements links = doc.select("a"); for (Element link : links) { String absHref = link.attr("abs:href"); // Omit nulls and empty strings if (absHref != null && !("".equals(absHref))) { context.write(new Text(targetURI), new Text(absHref)); } } } } } } } }
From source file:nl.surfsara.warcexamples.hadoop.wat.ServerTypeExtracter.java
License:Apache License
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); // Only try to parse json content if ("application/json".equals(value.header.contentTypeStr)) { context.getCounter(Counters.NUM_JSON_RECORDS).increment(1); // Get the json payload Payload payload = value.getPayload(); if (payload == null) { // NOP } else {//from w ww . j a v a2 s.co m String warcContent = IOUtils.toString(payload.getInputStreamComplete()); JSONObject json = new JSONObject(warcContent); try { String server = json.getJSONObject("Envelope").getJSONObject("Payload-Metadata") .getJSONObject("HTTP-Response-Metadata").getJSONObject("Headers").getString("Server"); context.write(new Text(server), new LongWritable(1)); } catch (JSONException e) { // Not the JSON we were looking for.. logger.error(e); } } } }
From source file:nl.surfsara.warcexamples.hadoop.wet.NERMapper.java
License:Apache License
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); // Only process text/plain content if ("text/plain".equals(value.header.contentTypeStr)) { context.getCounter(Counters.NUM_TEXT_RECORDS).increment(1); // Get the text payload Payload payload = value.getPayload(); if (payload == null) { // NOP } else {//from ww w.j a va2s .c o m if (numrecords < MAX_RECORDS) { String warcContent = IOUtils.toString(payload.getInputStreamComplete()); if (warcContent == null && "".equals(warcContent)) { // NOP } else { // Classify text List<List<CoreLabel>> classify = classifier.classify(warcContent); for (List<CoreLabel> coreLabels : classify) { for (CoreLabel coreLabel : coreLabels) { String term = coreLabel.word(); String tag = coreLabel.get(CoreAnnotations.AnswerAnnotation.class); if (!"O".equals(tag)) { context.write(new Text(tag), new Text(term)); } } } numrecords++; } } } } }
From source file:nl.tudelft.graphalytics.giraph.algorithms.cdlp.CommonCommunityDetectionLPComputation.java
License:Apache License
/** * Chooses new label by finding the most frequent label and picking the lowest id amongst the most frequent. *//*from w w w . j a va2 s .co m*/ private void determineLabel(Vertex<LongWritable, LongWritable, E> vertex, Iterable<LongWritable> incomingLabels) { // Compute for each incoming label the aggregate and maximum scores labelOccurences.clear(); labelOccurences.defaultReturnValue(0L); for (LongWritable incomingLabel : incomingLabels) { long label = incomingLabel.get(); labelOccurences.put(label, labelOccurences.get(label) + 1); } // Find the label with the highest frequency score (primary key) and lowest id (secondary key) long bestLabel = 0; long highestFrequency = 0; for (Long2LongMap.Entry labelFrequency : labelOccurences.long2LongEntrySet()) { long label = labelFrequency.getLongKey(); long frequency = labelFrequency.getLongValue(); if (frequency > highestFrequency || (frequency == highestFrequency && label < bestLabel)) { bestLabel = label; highestFrequency = frequency; } } // Update the label of this vertex vertex.getValue().set(bestLabel); }
From source file:nl.tudelft.graphalytics.giraph.algorithms.cdlp.DirectedCommunityDetectionLPComputation.java
License:Apache License
@Override protected void doInitialisationStep(Vertex<LongWritable, LongWritable, BooleanWritable> vertex, Iterable<LongWritable> messages) { if (getSuperstep() == 0) { // Send vertex id to outgoing neighbours, so that all vertices know their incoming edges. sendMessageToAllEdges(vertex, vertex.getId()); } else {/*from ww w .ja va 2 s . c om*/ // Store incoming messages (vertex ids) in a set LongSet messageSet = new LongOpenHashSet(); for (LongWritable message : messages) { messageSet.add(message.get()); } // Update the value of existing edges for (MutableEdge<LongWritable, BooleanWritable> edge : vertex.getMutableEdges()) { long targetVertexId = edge.getTargetVertexId().get(); if (messageSet.contains(targetVertexId)) { messageSet.remove(targetVertexId); edge.getValue().set(BIDIRECTIONAL); } } // Create new unidirectional edges to match incoming edges for (LongIterator messageIter = messageSet.iterator(); messageIter.hasNext();) { long newEdge = messageIter.nextLong(); vertex.addEdge(EdgeFactory.create(new LongWritable(newEdge), UNIDIRECTIONAL_EDGE)); } // Set the initial label of the vertex vertex.getValue().set(vertex.getId().get()); } }