org.mfcrawler.model.export.gexf.ExportPagesGexf.java Source code

Java tutorial

Introduction

Here is the source code for org.mfcrawler.model.export.gexf.ExportPagesGexf.java

Source

/*
Mini Focused Crawler : focused web crawler with a simple GUI
Copyright (C) 2013  lbertelo
    
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
    
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.mfcrawler.model.export.gexf;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang3.StringEscapeUtils;
import org.mfcrawler.model.dao.export.ExportPageDAO;
import org.mfcrawler.model.dao.iterator.LinkDbIterator;
import org.mfcrawler.model.dao.iterator.PageDbIterator;
import org.mfcrawler.model.pojo.site.Page;
import org.mfcrawler.model.pojo.site.link.Link;
import org.mfcrawler.model.util.ConversionUtils;
import org.mfcrawler.model.util.I18nUtil;

/**
 * Exports pages to a GEXF file: pages are written as nodes and links between
 * them as directed edges.
 * 
 * @author lbertelo
 */
public final class ExportPagesGexf {

    /**
     * Date format for the GEXF header
     */
    private static final String DATE_FORMAT = "yyyy-MM-dd";

    /**
     * Datetime format for nodes
     */
    private static final String DATETIME_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS";

    /**
     * Private constructor (static utility class, never instantiated)
     */
    private ExportPagesGexf() {
    }

    // Main

    /**
     * Exports pages with a minimum score in a GEXF file. The file is
     * overwritten and encoded in UTF-8. An {@link IOException} raised while
     * writing is logged at SEVERE level and not rethrown.
     * @param file the file used to export
     * @param minScore minimum score for a page
     */
    public static void export(File file, Double minScore) {
        // The document carries no XML encoding declaration, so readers assume
        // UTF-8: write UTF-8 explicitly instead of the platform default charset.
        // try-with-resources closes (and therefore flushes) the writer even
        // when one of the write steps fails.
        try (Writer gexfWriter = new OutputStreamWriter(new FileOutputStream(file, false),
                StandardCharsets.UTF_8)) {
            writeGexfHeader(gexfWriter);

            generatePagesNodes(gexfWriter, minScore);

            generateLinksEdges(gexfWriter, minScore);

            writeGexfFooter(gexfWriter);
        } catch (IOException ie) {
            Logger.getLogger(ExportPagesGexf.class.getName()).log(Level.SEVERE, "Error to write gexf", ie);
        }
    }

    /**
     * Writes the GEXF header: the root element, the meta section (last
     * modified date, creator, description), the opening graph tag and the
     * declaration of the node attributes (score, innerDeep, outerDeep).
     * @param writer the writer receiving the GEXF document
     * @throws IOException if writing fails
     */
    private static void writeGexfHeader(Writer writer) throws IOException {
        writer.write("<gexf xmlns=\"http://www.gexf.net/1.2draft\" version=\"1.2\" ");
        writer.write(
                "xmlns:viz=\"http://www.gexf.net/1.2draft/viz\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" ");
        writer.write(
                "xsi:schemaLocation=\"http://www.gexf.net/1.2draft http://www.gexf.net/1.2draft/gexf.xsd\"> \n");
        writer.write("<meta lastmodifieddate=\"");
        writer.write(ConversionUtils.toFormattedDate(new Date(), DATE_FORMAT));
        writer.write("\"> \n");
        writer.write("<creator> ");
        writer.write(I18nUtil.getMessage("export.creator"));
        writer.write(" </creator> \n <description> ");
        writer.write(I18nUtil.getMessage("export.descriptionGexf"));
        writer.write(" </description> \n");
        writer.write(
                "</meta> \n <graph defaultedgetype=\"directed\" idtype=\"string\" mode=\"dynamic\" timeformat=\"dateTime\"> ");
        writer.write("<attributes class=\"node\" mode=\"static\"> ");
        writer.write("<attribute id=\"score\" title=\"score\" type=\"double\" /> ");
        writer.write("<attribute id=\"innerDeep\" title=\"innerDeep\" type=\"integer\" /> ");
        // No text may follow </attributes>: a stray "> here would end up as
        // character data inside <graph>.
        writer.write("<attribute id=\"outerDeep\" title=\"outerDeep\" type=\"integer\" /> </attributes> \n\n");
    }

    /**
     * Writes the GEXF footer (closes the graph and gexf elements)
     * @param writer the writer receiving the GEXF document
     * @throws IOException if writing fails
     */
    private static void writeGexfFooter(Writer writer) throws IOException {
        writer.write("\n </graph> \n </gexf>");
    }

    // Nodes

    /**
     * Gets the pages to export (selected by the DAO from the minimum score)
     * and writes one node per page inside a nodes element
     * @param writer the writer receiving the GEXF document
     * @param minScore the minimum score
     * @throws IOException if writing fails
     */
    private static void generatePagesNodes(Writer writer, Double minScore) throws IOException {
        ExportPageDAO exportPageDao = new ExportPageDAO();
        // NOTE(review): the iterator is never closed here; confirm whether
        // PageDbIterator holds database resources that need releasing.
        PageDbIterator pageIterator = exportPageDao.getPageListToExport(minScore);

        writer.write("\n<nodes>\n");
        while (pageIterator.hasNext()) {
            writeGexfNode(writer, pageIterator.next());
        }
        writer.write("\n</nodes>\n");
    }

    /**
     * Writes a gexf node from a page. The node id is the page URL, the label
     * is the page title (or the URL when the page has no title), the start
     * time is the crawl time; score, innerDeep and outerDeep are written as
     * attribute values.
     * @param writer the writer receiving the GEXF document
     * @param page the page (the node)
     * @throws IOException if writing fails
     */
    private static void writeGexfNode(Writer writer, Page page) throws IOException {
        String linkName = StringEscapeUtils.escapeXml(page.getLink().getUrl());
        String titleName = StringEscapeUtils.escapeXml(page.getTitle());
        if (titleName == null) {
            // escapeXml returns null for a null title and Writer.write(String)
            // throws NullPointerException on null: label the node with its URL.
            titleName = linkName;
        }

        writer.write("<node id=\"");
        writer.write(linkName);
        writer.write("\" label=\"");
        writer.write(titleName);
        writer.write("\" start=\"");
        writer.write(ConversionUtils.toFormattedDate(page.getCrawlTime(), DATETIME_FORMAT));

        writer.write("\"> <attvalues> <attvalue for=\"score\" value=\"");
        writer.write(ConversionUtils.toString(page.getScore()));
        writer.write("\"/> <attvalue for=\"innerDeep\" value=\"");
        writer.write(ConversionUtils.toString(page.getInnerDeep()));
        writer.write("\"/> <attvalue for=\"outerDeep\" value=\"");
        writer.write(ConversionUtils.toString(page.getOuterDeep()));
        writer.write("\"/> </attvalues> </node> \n");
    }

    // Edges

    /**
     * Gets the source links and, for each of them, its target links (both
     * selected by the DAO from the minimum score) and writes one edge per
     * source/target pair inside an edges element. Edge ids are consecutive
     * integers starting at 0.
     * @param writer the writer receiving the GEXF document
     * @param minScore the minimum score
     * @throws IOException if writing fails
     */
    private static void generateLinksEdges(Writer writer, Double minScore) throws IOException {
        writer.write("\n<edges>\n");
        int edgeId = 0;

        ExportPageDAO exportPageDao = new ExportPageDAO();
        LinkDbIterator linkIterator = exportPageDao.getSourceLinkList(minScore);

        while (linkIterator.hasNext()) {
            Link sourceLink = linkIterator.next();
            List<Link> targetLinkList = exportPageDao.getTargetLinkList(sourceLink, minScore);
            for (Link targetLink : targetLinkList) {
                writeGexfEdge(writer, edgeId++, sourceLink, targetLink);
            }
        }

        writer.write("\n</edges>\n");
    }

    /**
     * Writes a directed gexf edge from a source link to a target link; both
     * ends are identified by their XML-escaped URL, matching the node ids.
     * @param writer the writer receiving the GEXF document
     * @param edgeId the edge id
     * @param sourceLink the source link, the source of the edge
     * @param targetLink the outgoing link, the target of the edge
     * @throws IOException if writing fails
     */
    private static void writeGexfEdge(Writer writer, int edgeId, Link sourceLink, Link targetLink)
            throws IOException {
        writer.write("<edge id=\"");
        writer.write(ConversionUtils.toString(edgeId));
        writer.write("\" source=\"");
        writer.write(StringEscapeUtils.escapeXml(sourceLink.getUrl()));
        writer.write("\" target=\"");
        writer.write(StringEscapeUtils.escapeXml(targetLink.getUrl()));
        writer.write("\" type=\"directed\" /> \n");
    }

}