org.ala.report.GoogleSitemapGenerator.java Source code

Java tutorial

Introduction

Here is the source code for org.ala.report.GoogleSitemapGenerator.java

Source

/***************************************************************************
 * Copyright (C) 2010 Atlas of Living Australia
 * All Rights Reserved.
 *
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ***************************************************************************/
package org.ala.report;

import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Map;

import org.ala.dao.StoreHelper;
import org.ala.model.Classification;
import org.ala.model.CommonName;
import org.ala.model.TaxonConcept;
import org.ala.model.TaxonName;
import org.ala.util.ColumnType;
import org.ala.util.SpringUtils;
import org.apache.log4j.Logger;
import java.util.ArrayList;

import javax.inject.Inject;

import org.apache.commons.lang.StringEscapeUtils;
import org.codehaus.jackson.map.DeserializationConfig;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.map.type.TypeFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.stereotype.Component;

/**
 * GoogleSitemapGenerator.
 *
 * Pages through a column family via {@link StoreHelper} and writes sitemap XML
 * files (at most {@link #MAX_NUMBER_URL} URLs per file) containing one
 * <code>http://bie.ala.org.au/species/&lt;name&gt;</code> URL for the
 * scientific name, the preferred common name and the kingdom-qualified
 * scientific name of every row flagged as Australian whose GUID contains
 * {@link #APNI_TAXON} or {@link #ADF_TAXON}.
 *
 * Instances keep per-run state in fields and are not safe for concurrent use.
 *
 * @author MOK011
 *
 * History:
 * init version: 10 Sept 2011.
 */
@Component("googleSitemapGenerator")
public class GoogleSitemapGenerator {
    @Inject
    protected StoreHelper storeHelper;
    protected Logger logger = Logger.getLogger(this.getClass());

    /** Number of rows requested per page from the store. */
    public static final int ROWS = 1000;
    public static final String CHARSET_ENCODING = "UTF-8";

    private String keyspace = "bie";
    private String columnFamily = "tc";
    private ObjectMapper mapper;

    /** Number of URLs written to the sitemap file that is currently open. */
    private int urlCtr = 0;
    /** Sequence number used in the name of the next sitemap file. */
    private int fileNameCtr = 1;

    /** Maximum number of URLs written to a single sitemap file. */
    public static final int MAX_NUMBER_URL = 20000;
    /** Writer of the currently open sitemap file; null whenever no file is open. */
    private FileWriter fw = null;
    /** Base output file name as configured through {@link #setFileName(String)}. */
    private String fileName = null;
    /** Base file name plus the timestamp of the current run; set by {@link #doFullScan()}. */
    private String runFilePrefix = null;
    public static final String APNI_TAXON = ":apni.taxon:";
    public static final String ADF_TAXON = ":afd.taxon:";

    /** Index of each value inside the array returned by getSciAndCmnName. */
    enum NamePos {
        SCIENTIFIC_NAME, COMMON_NAME, KINGDOM, IS_AUSTRALIAN, NAME_COMPLETE
    }

    // Names already written to the current sitemap file, used to avoid duplicate URLs.
    // The names themselves are stored (not their hash codes) so that two different
    // names with colliding hash codes are never mistaken for duplicates.
    private Set<String> track = new HashSet<String>();

    /**
     * Usage: [outputFileName]
     *
     * Uses "Sitemap" as the output file name when no argument is given. Any
     * argument after the first is ignored. Exits with status 0 on success and
     * 1 when the scan or the store shutdown fails.
     *
     * @param args command line arguments
     */
    public static void main(String[] args) throws Exception {
        ApplicationContext context = SpringUtils.getContext();
        GoogleSitemapGenerator googleSitemapGenerator = context.getBean(GoogleSitemapGenerator.class);

        // Always end up with a usable file name, even when extra arguments are supplied.
        googleSitemapGenerator.setFileName(args.length >= 1 ? args[0] : "Sitemap");
        if (args.length > 1) {
            System.out.println("Ignoring extra input arguments: " + Arrays.toString(args));
        }

        int exitCode = 0;
        try {
            googleSitemapGenerator.doFullScan();
        } catch (Exception e) {
            System.out.println("***** Fatal Error !!!.... shutdown cassandra connection.");
            e.printStackTrace();
            exitCode = 1;
        } finally {
            // Shut the store down exactly once, whether or not the scan succeeded.
            try {
                googleSitemapGenerator.closeConnectionPool();
            } catch (RuntimeException e) {
                e.printStackTrace();
                exitCode = 1;
            }
        }
        System.exit(exitCode);
    }

    /** Creates a generator for keyspace "bie" and column family "tc". */
    public GoogleSitemapGenerator() {
        this("bie", "tc");
    }

    /**
     * @param keySpace keyspace name (only stored and exposed through the getter)
     * @param columnFamily column family that {@link #doFullScan()} pages through
     */
    public GoogleSitemapGenerator(String keySpace, String columnFamily) {
        this.keyspace = keySpace;
        this.columnFamily = columnFamily;

        mapper = new ObjectMapper();
        mapper.getDeserializationConfig().set(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    }

    /**
     * close cassandra connection pool.
     */
    public void closeConnectionPool() {
        storeHelper.shutdown();
    }

    /**
     * Scans the whole column family page by page and writes the sitemap files.
     *
     * Output files are named
     * <code>&lt;fileName&gt;&lt;yyMMddHHmm&gt;_&lt;n&gt;.xml</code>; "Sitemap"
     * is used when no file name has been set. The configured file name itself
     * is left untouched, so repeated calls do not accumulate timestamps.
     *
     * @throws Exception if reading from the store or finishing the last file
     *             fails; the file open at that moment is closed first
     */
    public void doFullScan() throws Exception {
        long start = System.currentTimeMillis();

        String baseName = (fileName == null) ? "Sitemap" : fileName;
        runFilePrefix = baseName + new SimpleDateFormat("yyMMddHHmm").format(new Date());

        System.out.println("GoogleSitemapGenerator process is started.....");

        ColumnType[] columns = new ColumnType[] { ColumnType.TAXONCONCEPT_COL, ColumnType.CLASSIFICATION_COL,
                ColumnType.VERNACULAR_COL, ColumnType.IS_AUSTRALIAN, ColumnType.TAXONNAME_COL };

        try {
            // Start with the empty key, then use the last key of each page as the
            // start key of the next one. The scan is finished when a page is empty
            // or when its last key is the key the page was started from.
            String startKey = "";
            Map<String, Map<String, Object>> rowMaps = storeHelper.getPageOfSubColumns(columnFamily, columns,
                    startKey, ROWS);
            while (rowMaps != null && !rowMaps.isEmpty()) {
                generateURL(rowMaps);
                String lastKey = lastKeyOf(rowMaps);
                if (lastKey.equals(startKey)) {
                    break;
                }
                startKey = lastKey;
                rowMaps = storeHelper.getPageOfSubColumns(columnFamily, columns, startKey, ROWS);
            }
        } catch (Exception e) {
            // Do not leave a half-written file open when the scan is aborted.
            closeFileAfterFailure();
            throw e;
        }

        // No-op when no file is open (nothing written, or the last file was
        // already closed because it reached MAX_NUMBER_URL).
        writeFileFooter();
        urlCtr = 0;

        System.out.println("GoogleSitemapGenerator process is ended, total time takem: "
                + ((System.currentTimeMillis() - start) / 1000));
    }

    /**
     * Returns the last key in the iteration order of the given page.
     */
    private static String lastKeyOf(Map<String, ?> rows) {
        String last = null;
        for (String key : rows.keySet()) {
            last = key;
        }
        return last;
    }

    /**
     * Appends one &lt;url&gt; entry for the given name to the current sitemap
     * file, opening a new file when none is open and closing the file once it
     * holds {@link #MAX_NUMBER_URL} entries. Null, blank and already written
     * names are ignored.
     *
     * @param name raw (unescaped) species name
     * @throws IOException if the sitemap file cannot be created or written
     */
    private void writeURL(String name) throws IOException {
        if (name == null) {
            return;
        }
        String trimmed = name.trim();
        if (trimmed.length() == 0 || track.contains(trimmed)) {
            return;
        }
        if (fw == null) {
            writeFileHeader();
            track.clear();
        }

        // URL-encode the raw name first, then entity-escape the finished URL.
        // Escaping before encoding would turn "'" into "%26apos%3B" inside the URL.
        String loc = "http://bie.ala.org.au/species/" + java.net.URLEncoder.encode(trimmed, CHARSET_ENCODING);
        fw.write("<url>\n");
        fw.write("<loc>" + StringEscapeUtils.escapeXml(loc) + "</loc>\n");
        fw.write("<changefreq>daily</changefreq>\n");
        fw.write("<priority>0.5000</priority>\n");
        fw.write("</url>\n");

        track.add(trimmed);
        urlCtr++;
        if (urlCtr >= MAX_NUMBER_URL) {
            writeFileFooter();
            urlCtr = 0;
        }
    }

    /**
     * Opens the next sitemap file and writes the XML declaration and the
     * opening &lt;urlset&gt; element.
     */
    private void writeFileHeader() throws IOException {
        String path = runFilePrefix + "_" + fileNameCtr + ".xml";
        System.out.println("**** file created: " + path);
        fw = new FileWriter(path);
        urlCtr = 0;

        fw.write("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
        fw.write("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n");

        fileNameCtr++;
    }

    /**
     * Writes the closing &lt;urlset&gt; element and closes the current sitemap
     * file. Does nothing when no file is open, so it is safe to call twice.
     */
    private void writeFileFooter() throws IOException {
        if (fw == null) {
            return;
        }
        FileWriter current = fw;
        // Forget the writer first so a failure below cannot leave a dead writer behind.
        fw = null;
        try {
            current.write("</urlset>\n");
            current.flush();
        } finally {
            current.close();
        }
    }

    /**
     * Best-effort close of the current sitemap file after an I/O failure; a
     * secondary failure is logged instead of thrown.
     */
    private void closeFileAfterFailure() {
        try {
            writeFileFooter();
        } catch (IOException e) {
            logger.error("Unable to close sitemap file cleanly", e);
        }
        urlCtr = 0;
    }

    /**
     * Writes the sitemap URLs for every row of the page that is flagged as
     * Australian: the scientific name, the preferred common name and, when both
     * parts are present, "scientificName (kingdom)".
     *
     * An I/O failure on one URL is logged, the current file is closed, and
     * processing continues with the next URL in a new file.
     *
     * @param rowMaps one page of rows keyed by GUID
     */
    private void generateURL(Map<String, Map<String, Object>> rowMaps) {
        for (Map.Entry<String, Map<String, Object>> row : rowMaps.entrySet()) {
            String guid = row.getKey();
            String[] names = getSciAndCmnName(row.getValue(), guid);
            if (names == null || !"true".equalsIgnoreCase(names[NamePos.IS_AUSTRALIAN.ordinal()])) {
                continue;
            }

            String scientificName = names[NamePos.SCIENTIFIC_NAME.ordinal()];
            String commonName = names[NamePos.COMMON_NAME.ordinal()];
            String kingdom = names[NamePos.KINGDOM.ordinal()];
            logger.debug("******** GUID: " + guid + ", SCIENTIFIC_NAME: " + scientificName + " urlCtr: " + urlCtr);

            // The kingdom-qualified form only makes sense when both parts are known.
            String qualifiedName = null;
            if (scientificName.length() > 0 && kingdom.length() > 0) {
                qualifiedName = scientificName + " (" + kingdom + ")";
            }

            String[] candidates = new String[] { scientificName, commonName, qualifiedName };
            for (String candidate : candidates) {
                try {
                    writeURL(candidate);
                } catch (IOException e) {
                    logger.error("Unable to write sitemap URL for GUID " + guid, e);
                    closeFileAfterFailure();
                }
            }
        }
    }

    /**
     * Extracts the names of one row.
     *
     * @param columnMap column values of the row keyed by column name
     * @param guid row key
     * @return array indexed by {@link NamePos} ordinal whose entries are never
     *         null ("" when a value is missing), or null when the row has no
     *         columns or the GUID contains neither {@link #APNI_TAXON} nor
     *         {@link #ADF_TAXON}. The scientific name entry holds the complete
     *         taxon name whenever one is available.
     */
    private String[] getSciAndCmnName(Map<String, Object> columnMap, String guid) {
        if (columnMap == null || guid == null) {
            return null;
        }
        String trimmedGuid = guid.trim();
        if (!trimmedGuid.contains(APNI_TAXON) && !trimmedGuid.contains(ADF_TAXON)) {
            return null;
        }

        String[] names = new String[] { "", "", "", "", "" };

        @SuppressWarnings("unchecked")
        List<Classification> classifications = (List<Classification>) columnMap
                .get(ColumnType.CLASSIFICATION_COL.getColumnName());
        if (classifications != null && !classifications.isEmpty() && classifications.get(0) != null) {
            names[NamePos.KINGDOM.ordinal()] = trimToEmpty(classifications.get(0).getKingdom());
        }

        @SuppressWarnings("unchecked")
        List<CommonName> commonNames = (List<CommonName>) columnMap
                .get(ColumnType.VERNACULAR_COL.getColumnName());
        if (commonNames != null) {
            // Only the first preferred common name is used.
            for (CommonName commonName : commonNames) {
                if (commonName != null && commonName.isPreferred()) {
                    names[NamePos.COMMON_NAME.ordinal()] = trimToEmpty(commonName.getNameString());
                    break;
                }
            }
        }

        TaxonConcept taxonConcept = (TaxonConcept) columnMap.get(ColumnType.TAXONCONCEPT_COL.getColumnName());
        if (taxonConcept != null) {
            names[NamePos.SCIENTIFIC_NAME.ordinal()] = trimToEmpty(taxonConcept.getNameString());
        }

        Object isAustralian = columnMap.get(ColumnType.IS_AUSTRALIAN.getColumnName());
        if (isAustralian != null) {
            names[NamePos.IS_AUSTRALIAN.ordinal()] = isAustralian.toString().trim();
        }

        @SuppressWarnings("unchecked")
        List<TaxonName> taxonNames = (List<TaxonName>) columnMap.get(ColumnType.TAXONNAME_COL.getColumnName());
        if (taxonNames != null && !taxonNames.isEmpty() && taxonNames.get(0) != null) {
            names[NamePos.NAME_COMPLETE.ordinal()] = trimToEmpty(taxonNames.get(0).getNameComplete());
        }

        // replace scientific name with name complete.
        if (names[NamePos.NAME_COMPLETE.ordinal()].length() > 0) {
            names[NamePos.SCIENTIFIC_NAME.ordinal()] = names[NamePos.NAME_COMPLETE.ordinal()];
        }

        return names;
    }

    /**
     * Returns the trimmed value, or "" when the value is null.
     */
    private static String trimToEmpty(String value) {
        return value == null ? "" : value.trim();
    }

    //========= Getter =======
    public static int getRows() {
        return ROWS;
    }

    public String getKeyspace() {
        return keyspace;
    }

    public String getColumnFamily() {
        return columnFamily;
    }

    // =========== setter ===========
    /**
     * @param fileName base name (optionally with a path) of the sitemap files;
     *            a timestamp, a sequence number and ".xml" are appended
     */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public void setKeyspace(String keyspace) {
        this.keyspace = keyspace;
    }

    public void setColumnFamily(String columnFamily) {
        this.columnFamily = columnFamily;
    }
}