de.gesundkrank.wikipedia.hadoop.WikiRevisionWritable.java Source code

Java tutorial

Introduction

Here is the source code for de.gesundkrank.wikipedia.hadoop.WikiRevisionWritable.java

Source

/*
 * This file is part of wikipedia-hadoop.
 * Copyright (C) 2015 Jan Graegger
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package de.gesundkrank.wikipedia.hadoop;

import com.google.common.base.Strings;
import org.apache.hadoop.io.WritableComparable;
import org.wikiclean.WikiClean;
import org.wikiclean.WikiCleanBuilder;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Objects;

/**
 * Hadoop writable describing a single revision of a wiki page: its id, timestamp, owning page,
 * contributor, edit comment, minor-edit flag and raw text.
 *
 * <p>Serialized layout, in order: id (long), timestamp (long), page, contributor-present flag
 * (boolean) followed by the contributor if present, comment (modified UTF, empty when unset),
 * minor flag (boolean), text-present flag (boolean) followed by the UTF-8 byte length (int) and
 * the UTF-8 bytes if present.
 *
 * <p>Ordering, equality and hash code are all based on the revision id only.
 *
 * @author Jan Graegger<jan@anycook.de>
 */
public class WikiRevisionWritable implements WritableComparable<WikiRevisionWritable> {

    // Lazily created cleaner shared by all instances, and the language it was built for.
    // These fields are not synchronized.
    private static WikiClean cleaner;
    private static WikiClean.WikiLanguage cleanerLanguage;

    private long id;
    private long timestamp;
    private WikiPageWritable page;
    private WikiRevisionContributor contributor;
    private String comment;
    private String text;
    private boolean isMinor = false;

    /**
     * Creates a revision without a page. A page has to be set (or read via
     * {@link #readFields(DataInput)}) before the revision can be written.
     */
    public WikiRevisionWritable() {
        this(null);
    }

    /**
     * Creates a revision belonging to the given page.
     *
     * @param page the page this revision belongs to, may be {@code null}
     */
    public WikiRevisionWritable(WikiPageWritable page) {
        this.page = page;
    }

    /** @return the revision id */
    public long getId() {
        return id;
    }

    /** @param id the revision id */
    public void setId(long id) {
        this.id = id;
    }

    /** @return the revision timestamp as stored by {@link #setTimestamp(long)} */
    public long getTimestamp() {
        return timestamp;
    }

    /** @param timestamp the revision timestamp */
    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    /** @return the page this revision belongs to, or {@code null} if none was set */
    public WikiPageWritable getPage() {
        return page;
    }

    /** @param page the page this revision belongs to */
    public void setPage(WikiPageWritable page) {
        this.page = page;
    }

    /** @return the contributor of this revision, or {@code null} if none was set */
    public WikiRevisionContributor getContributor() {
        return contributor;
    }

    /** @param contributor the contributor of this revision, may be {@code null} */
    public void setContributor(WikiRevisionContributor contributor) {
        this.contributor = contributor;
    }

    /** @return the edit comment, never {@code null} (empty if unset) */
    public String getComment() {
        return Strings.nullToEmpty(comment);
    }

    /** @param comment the edit comment, may be {@code null} */
    public void setComment(String comment) {
        this.comment = comment;
    }

    /** @return the raw revision text, never {@code null} (empty if unset) */
    public String getText() {
        return Strings.nullToEmpty(text);
    }

    /**
     * Returns the revision text with wiki markup removed by WikiClean.
     *
     * <p>The locale is matched by its language code: Chinese and German select the matching
     * WikiClean language, everything else (including {@code null}) falls back to English. The
     * cleaner is cached in static fields and rebuilt only when the requested language changes.
     *
     * @param locale the locale whose language selects the WikiClean language, may be {@code null}
     * @return the cleaned text
     */
    public String getPlainText(Locale locale) {
        final WikiClean.WikiLanguage wikiLanguage = toWikiLanguage(locale);

        if (cleaner == null || cleanerLanguage != wikiLanguage) {
            // Build first and record the language afterwards, so the recorded language never
            // names a cleaner that has not been built yet.
            cleaner = new WikiCleanBuilder().withLanguage(wikiLanguage).build();
            cleanerLanguage = wikiLanguage;
        }

        // WikiClean wants to read text from a xml element, so it gets one
        return cleaner.clean("<text xml:space=\"preserve\">" + getText() + "</text>");
    }

    /** Maps a locale to the WikiClean language by language code, defaulting to English. */
    private static WikiClean.WikiLanguage toWikiLanguage(Locale locale) {
        if (locale == null) {
            return WikiClean.WikiLanguage.EN;
        }

        final String language = locale.getLanguage();
        if (Locale.CHINESE.getLanguage().equals(language)) {
            return WikiClean.WikiLanguage.ZH;
        }
        if (Locale.GERMAN.getLanguage().equals(language)) {
            return WikiClean.WikiLanguage.DE;
        }
        return WikiClean.WikiLanguage.EN;
    }

    /** @param text the raw revision text, may be {@code null} */
    public void setText(String text) {
        this.text = text;
    }

    /** @return {@code true} if this revision is flagged as a minor edit */
    public boolean isMinor() {
        return isMinor;
    }

    /** @param isMinor whether this revision is a minor edit */
    public void setMinor(boolean isMinor) {
        this.isMinor = isMinor;
    }

    /**
     * Serializes this revision in the layout described in the class documentation.
     *
     * @param out the output to write to
     * @throws IOException if writing fails
     * @throws NullPointerException if no page has been set
     */
    @Override
    public void write(DataOutput out) throws IOException {
        // Fail before anything is written; the format has no way to encode a missing page.
        Objects.requireNonNull(page, "page must be set before a revision can be written");

        out.writeLong(id);
        out.writeLong(timestamp);
        page.write(out);

        if (contributor != null) {
            out.writeBoolean(true);
            contributor.write(out);
        } else {
            out.writeBoolean(false);
        }

        out.writeUTF(Strings.nullToEmpty(comment));
        out.writeBoolean(isMinor);

        // workaround for "java.io.UTFDataFormatException: encoded string too long"
        if (text != null) {
            out.writeBoolean(true);
            byte[] b = text.getBytes(StandardCharsets.UTF_8);
            out.writeInt(b.length);
            out.write(b);
        } else {
            out.writeBoolean(false);
        }
    }

    /**
     * Restores this revision from data produced by {@link #write(DataOutput)}, replacing all
     * current field values.
     *
     * @param in the input to read from
     * @throws IOException if reading fails or the stored text length is negative
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readLong();
        timestamp = in.readLong();
        page = new WikiPageWritable();
        page.readFields(in);

        if (in.readBoolean()) {
            contributor = new WikiRevisionContributor();
            contributor.readFields(in);
        } else {
            contributor = null;
        }
        comment = in.readUTF();
        isMinor = in.readBoolean();

        // Mirror write(): the presence flag comes first, the length only exists if text is present.
        if (in.readBoolean()) {
            int textLength = in.readInt();
            if (textLength < 0) {
                throw new IOException("Invalid negative text length: " + textLength);
            }
            byte[] b = new byte[textLength];
            in.readFully(b);
            text = new String(b, StandardCharsets.UTF_8);
        } else {
            text = null;
        }
    }

    @Override
    public String toString() {
        return String.format(
                "id:%d%n timestamp:%d%n page: %s %n contributor:%s %n comment:%s %n isMinor:%s %n text:%s", id,
                timestamp, page, contributor, comment, isMinor, text);
    }

    /**
     * Orders revisions by ascending revision id.
     *
     * @param o the revision to compare with
     * @return a negative value, zero or a positive value if this id is less than, equal to or
     *         greater than the other id
     */
    @Override
    public int compareTo(WikiRevisionWritable o) {
        return Long.compare(id, o.id);
    }

    /**
     * Two revisions are equal if they have the same revision id, consistent with
     * {@link #compareTo(WikiRevisionWritable)}.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof WikiRevisionWritable)) {
            return false;
        }
        return id == ((WikiRevisionWritable) obj).id;
    }

    /**
     * Hash of the revision id. Depends only on the id, so the value is the same in every JVM.
     */
    @Override
    public int hashCode() {
        return (int) (id ^ (id >>> 32));
    }

}