// MediaWikiParserFactory.java : » Wiki-Engine » jwpl » de » tudarmstadt » ukp » wikipedia » parser » mediawiki » Java Open Source
//
// Java Open Source » Wiki Engine » jwpl
// jwpl » de » tudarmstadt » ukp » wikipedia » parser » mediawiki » MediaWikiParserFactory.java
/*******************************************************************************
 * Copyright (c) 2010 Torsten Zesch.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 * 
 * Contributors:
 *     Torsten Zesch - initial API and implementation
 ******************************************************************************/
package de.tudarmstadt.ukp.wikipedia.parser.mediawiki;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language;

/**
 * Class for easy creating a configurated MediaWiki Parser...<br/>
 * @author CJacobi
 *
 */
public class MediaWikiParserFactory {

  private final Log logger = LogFactory.getLog(getClass());

  private Class parserClass;
  private Class templateParserClass;
  private String lineSeparator;
  private List<String> deleteTemplates;
  private List<String> parseTemplates;
  private List<String> categoryIdentifers;
  private List<String> languageIdentifers;
  private List<String> imageIdentifers;
  private boolean showImageText;
  private boolean deleteTags;
  private boolean showMathTagContent;
  private boolean calculateSrcSpans;

  /**
   * Creates a new UNCONFIGURATED Parser Factory.
   */
  public MediaWikiParserFactory(){
    initVariables();
  }

  /**
   * Creates a fully configurated parser factory for the specified language.<br/>
   * Next step is .createParser()...
   */
  protected MediaWikiParserFactory(Language language){
        initVariables();
        if (language.equals(Language.german)) {
            initGermanVariables();
        }
        else {
            logger.warn("Language not Implemented. Using default values.");
        }
  }

  private void initVariables(){
        lineSeparator = "LF";
    parserClass = ModularParser.class;
    imageIdentifers = new ArrayList<String>();
    categoryIdentifers = new ArrayList<String>();
    languageIdentifers = new ArrayList<String>();
    deleteTemplates = new ArrayList<String>();
    parseTemplates = new ArrayList<String>();
    showImageText = false;
    deleteTags = true;
    showMathTagContent = true;
    calculateSrcSpans = false;
    templateParserClass = ShowTemplateNamesAndParameters.class;

        initLanguages();
  }

  private void initLanguages(){
    //Init the Languages...
    languageIdentifers.add("aa");languageIdentifers.add("ab");languageIdentifers.add("af");
    languageIdentifers.add("am");languageIdentifers.add("an");languageIdentifers.add("ar");
    languageIdentifers.add("as");languageIdentifers.add("av");languageIdentifers.add("ay");
    languageIdentifers.add("az");

    languageIdentifers.add("ba");languageIdentifers.add("be");languageIdentifers.add("bg");
    languageIdentifers.add("bh");languageIdentifers.add("bi");languageIdentifers.add("bm");
    languageIdentifers.add("bn");languageIdentifers.add("bo");languageIdentifers.add("br");
    languageIdentifers.add("bs");

    languageIdentifers.add("ca");languageIdentifers.add("ce");languageIdentifers.add("ch");
    languageIdentifers.add("co");languageIdentifers.add("cr");languageIdentifers.add("cs");
    languageIdentifers.add("cv");languageIdentifers.add("cy");

    languageIdentifers.add("da");languageIdentifers.add("de");languageIdentifers.add("dk");
    languageIdentifers.add("dv");languageIdentifers.add("dz");

    languageIdentifers.add("ee");languageIdentifers.add("el");languageIdentifers.add("en");
    languageIdentifers.add("eo");languageIdentifers.add("es");languageIdentifers.add("et");
    languageIdentifers.add("eu");

    languageIdentifers.add("fa");languageIdentifers.add("ff");languageIdentifers.add("fi");
    languageIdentifers.add("fj");languageIdentifers.add("fo");languageIdentifers.add("fr");
    languageIdentifers.add("fy");

    languageIdentifers.add("ga");languageIdentifers.add("gd");languageIdentifers.add("gl");
    languageIdentifers.add("gn");languageIdentifers.add("gu");languageIdentifers.add("gv");

    languageIdentifers.add("ha");languageIdentifers.add("he");languageIdentifers.add("hi");
    languageIdentifers.add("hr");languageIdentifers.add("ht");languageIdentifers.add("hu");
    languageIdentifers.add("hy");

    languageIdentifers.add("ia");languageIdentifers.add("id");languageIdentifers.add("ie");
    languageIdentifers.add("ig");languageIdentifers.add("ii");languageIdentifers.add("ik");
    languageIdentifers.add("io");languageIdentifers.add("is");languageIdentifers.add("it");
    languageIdentifers.add("iu");

    languageIdentifers.add("ja");languageIdentifers.add("jv");

    languageIdentifers.add("ka");languageIdentifers.add("kg");languageIdentifers.add("ki");
    languageIdentifers.add("kk");languageIdentifers.add("kl");languageIdentifers.add("km");
    languageIdentifers.add("kn");languageIdentifers.add("ko");languageIdentifers.add("ks");
    languageIdentifers.add("ku");languageIdentifers.add("kv");languageIdentifers.add("kw");
    languageIdentifers.add("ky");

    languageIdentifers.add("la");languageIdentifers.add("lb");languageIdentifers.add("li");
    languageIdentifers.add("ln");languageIdentifers.add("lo");languageIdentifers.add("lt");
    languageIdentifers.add("lv");

    languageIdentifers.add("mg");languageIdentifers.add("mh");languageIdentifers.add("mi");
    languageIdentifers.add("mk");languageIdentifers.add("ml");languageIdentifers.add("mn");
    languageIdentifers.add("mo");languageIdentifers.add("mr");languageIdentifers.add("ms");
    languageIdentifers.add("mt");languageIdentifers.add("my");

    languageIdentifers.add("na");languageIdentifers.add("nb");languageIdentifers.add("ne");
    languageIdentifers.add("ng");languageIdentifers.add("nl");languageIdentifers.add("nn");
    languageIdentifers.add("no");languageIdentifers.add("nv");languageIdentifers.add("ny");

    languageIdentifers.add("oc");languageIdentifers.add("os");languageIdentifers.add("pa");
    languageIdentifers.add("pl");languageIdentifers.add("ps");languageIdentifers.add("pt");

    languageIdentifers.add("qu");

    languageIdentifers.add("rm");languageIdentifers.add("rn");languageIdentifers.add("ro");
    languageIdentifers.add("ru");languageIdentifers.add("rw");

    languageIdentifers.add("sa");languageIdentifers.add("sc");languageIdentifers.add("sd");
    languageIdentifers.add("se");languageIdentifers.add("sg");languageIdentifers.add("sh");
    languageIdentifers.add("si");languageIdentifers.add("sk");languageIdentifers.add("sl");
    languageIdentifers.add("sm");languageIdentifers.add("sn");languageIdentifers.add("so");
    languageIdentifers.add("sq");languageIdentifers.add("sr");languageIdentifers.add("ss");
    languageIdentifers.add("st");languageIdentifers.add("su");languageIdentifers.add("sv");
    languageIdentifers.add("sw");

    languageIdentifers.add("ta");languageIdentifers.add("te");languageIdentifers.add("tg");
    languageIdentifers.add("th");languageIdentifers.add("ti");languageIdentifers.add("tk");
    languageIdentifers.add("tl");languageIdentifers.add("tn");languageIdentifers.add("to");
    languageIdentifers.add("tr");languageIdentifers.add("ts");languageIdentifers.add("tt");
    languageIdentifers.add("tw");languageIdentifers.add("ty");

    languageIdentifers.add("ug");languageIdentifers.add("uk");languageIdentifers.add("ur");
    languageIdentifers.add("uz");

    languageIdentifers.add("ve");languageIdentifers.add("vi");languageIdentifers.add("vo");

    languageIdentifers.add("wa");languageIdentifers.add("wo");

    languageIdentifers.add("xh");

    languageIdentifers.add("yi");languageIdentifers.add("yo");

    languageIdentifers.add("za");languageIdentifers.add("zh");languageIdentifers.add("zu");

    languageIdentifers.add("als");languageIdentifers.add("ang");languageIdentifers.add("arc");languageIdentifers.add("ast");
    languageIdentifers.add("bug");
    languageIdentifers.add("ceb");languageIdentifers.add("chr");languageIdentifers.add("chy");languageIdentifers.add("csb");
    languageIdentifers.add("frp");
    languageIdentifers.add("fur");
    languageIdentifers.add("got");
    languageIdentifers.add("haw");
    languageIdentifers.add("ilo");
    languageIdentifers.add("jbo");
    languageIdentifers.add("ksh");
    languageIdentifers.add("lad");languageIdentifers.add("lmo");
    languageIdentifers.add("nah");languageIdentifers.add("nap");languageIdentifers.add("nds");languageIdentifers.add("nrm");
    languageIdentifers.add("pam");languageIdentifers.add("pap");languageIdentifers.add("pdc");languageIdentifers.add("pih");languageIdentifers.add("pms");
    languageIdentifers.add("rmy");
    languageIdentifers.add("scn");languageIdentifers.add("sco");
    languageIdentifers.add("tet");languageIdentifers.add("tpi");languageIdentifers.add("tum");
    languageIdentifers.add("udm");
    languageIdentifers.add("vec");languageIdentifers.add("vls");
    languageIdentifers.add("war");
    languageIdentifers.add("xal");

    languageIdentifers.add("simple");
  }

  private void initGermanVariables(){
    templateParserClass = GermanTemplateParser.class;
    deleteTemplates.add( "Prettytable" );
    parseTemplates.add( "Dieser Artikel" );
    parseTemplates.add( "Audio" );
    parseTemplates.add( "Video" );
    imageIdentifers.add("Bild");
    categoryIdentifers.add( "Kategorie" );
    languageIdentifers.remove("de");
  }

  private String resolveLineSeparator(){
    if( lineSeparator.equals("CRLF")) {
      return "\r\n";
    }
    if( lineSeparator.equals("LF")) {
      return "\n";
    }

    logger.error(
        "LineSeparator is UNKNOWN: \""+lineSeparator+"\"\n" +
        "Set LineSeparator to \"LF\" or \"CRLF\" for a Error free configuration" );

    return lineSeparator;
  }

  /**
   * Creates a MediaWikiParser with the configurations which has been set.
   */
  public MediaWikiParser createParser(){
    logger.debug( "Selected Parser: " + parserClass );

    if( parserClass == ModularParser.class ){
      ModularParser mwgp = new ModularParser(
//          resolveLineSeparator(),
          "\n",
          languageIdentifers,
          categoryIdentifers,
          imageIdentifers,
          showImageText,
          deleteTags,
          showMathTagContent,
          calculateSrcSpans,
          null );

      StringBuilder sb = new StringBuilder();
      sb.append( lineSeparator + "languageIdentifers: ");
      for( String s: languageIdentifers ) {
        sb.append( s + " ");
      }
      sb.append( lineSeparator + "categoryIdentifers: ");
      for( String s: categoryIdentifers ) {
        sb.append( s + " ");
      }
      sb.append( lineSeparator + "imageIdentifers: ");
      for( String s: imageIdentifers ) {
        sb.append( s + " ");
      }
      logger.debug( sb.toString() );

      MediaWikiTemplateParser mwtp;

      logger.debug( "Selected TemplateParser: "+ templateParserClass);
      if( templateParserClass == GermanTemplateParser.class ){
        for( String s: deleteTemplates) {
          logger.debug( "DeleteTemplate: '" + s + "'");
        }
        for( String s: parseTemplates) {
          logger.debug( "ParseTemplate: '" + s + "'");
        }
        mwtp = new GermanTemplateParser( mwgp, deleteTemplates, parseTemplates );
      }
      else if( templateParserClass == FlushTemplates.class ) {
        mwtp = new FlushTemplates();
      } else if( templateParserClass == ShowTemplateNamesAndParameters.class ){
        mwtp = new ShowTemplateNamesAndParameters();
      }
      else{
        logger.error("TemplateParser Class Not Found!");
        return null;
      }

      mwgp.setTemplateParser( mwtp );

      return mwgp;
    }
    else{
      logger.error("Parser Class Not Found!");
      return null;
    }
  }

  /**
   * Adds a Template which should be deleted while the parsing process.
   */
  public void addDeleteTemplate( String deleteTemplate ){
    deleteTemplates.add( deleteTemplate );
  }

  /**
   * Adds a Template which should be "parsed" while the parsing process.
   */
  public void addParseTemplate( String parseTemplate ){
    parseTemplates.add( parseTemplate );
  }

  /**
   * Retuns the Class of the selected Parser.
   */
  public Class getParserClass(){
    return parserClass;
  }

  /**
   * Set the Parser which should be configurated and returned by createParser().
   */
  public void setParserClass(Class parserClass){ this.parserClass = parserClass; }

  /**
   * Returns the Class of the selected TemplateParser.
   */
  public Class getTemplateParserClass(){ return templateParserClass; }

  /**
   * Set the Parser which should be used for Template parsing.
   */
  public void setTemplateParserClass(Class templateParserClass){ this.templateParserClass = templateParserClass; }

  /**
   * Retuns the List of templates which should be deleted in the parseing process.
   */
  public List<String> getDeleteTemplates(){ return deleteTemplates; }

  /**
   * Set the List of templates which should be deleted in the parseing process.
   */
  public void setDeleteTemplates(List<String> deleteTemplates){ this.deleteTemplates = deleteTemplates; }

  /**
   * Returns the CharSequence/String which should be used as line separator.
   */
  public String getLineSeparator(){ return lineSeparator; }

  /**
   * Sets the CharSequence/String which should be used as line separator.
   */
  public void setLineSeparator(String lineSeparator){  this.lineSeparator = lineSeparator; }

  /**
   * Returns the List of templates which should be "parsed" in the parseing process.
   */
  public List<String> getParseTemplates(){ return parseTemplates; }

  /**
   * Sets the List of templates which should be "parsed" in the parseing process.
   */
  public void setParseTemplates(List<String> parseTemplates){ this.parseTemplates = parseTemplates; }

  /**
   * Returns the List of Strings which are used to specifiy that a link is a link to a
   * wikipedia i another language.
   */
  public List<String> getLanguageIdentifers(){ return languageIdentifers; }

  /**
   * Sets the list of language identifiers.
   */
  public void setLanguageIdentifers( List<String> languageIdentifers){ this.languageIdentifers = languageIdentifers; }

  /**
   * Returns the List of Strings which are used to specifiy that a link is a link to a
   * cathegory. E.g. in german "Kathegorie" is used. But it could be usefull to use more
   * than one identifier, mainly the english identifier "cathegory" should be used too.
   */
  public List<String> getCategoryIdentifers( ){ return categoryIdentifers; }

  /**
   * Set the list of cathegory identifers.
   */
  public void setCategoryIdentifers( List<String> categoryIdentifers){ this.categoryIdentifers = categoryIdentifers; }

  /**
   * Returns the List of Strings which are used to specifiy that a link is an Image.
   */
  public List<String> getImageIdentifers( ){ return imageIdentifers; }

  /**
   * Sets the image identifer list.
   */
  public void setImageIdentifers( List<String> imageIdentifers){ this.imageIdentifers = imageIdentifers; }

  /**
   * Returns if the Parser should show the Text of an Image, or delete it. If the Text is deleted,
   * it will be added as a Parameter to the Link.
   * @return true, if the Text should be shown.
   */
  public boolean getShowImageText(){ return showImageText; }

  /**
   * Sets if the Parser should show the Text of an Image, or delete it.
   */
  public void setShowImageText( boolean showImageText ){ this.showImageText = showImageText; }

  /**
   * Returns if &lf; * > tags should be deleted or annotaded.
   * @return true if the tags should be deleted.
   */
  public boolean getDeleteTags() { return deleteTags; }

  /**
   * Sets if &lf; * > tags should be deleted or annotaded.
   */
  public void setDeleteTags(boolean deleteTags) {  this.deleteTags = deleteTags; }

  /**
   * Retruns if the Content of math tags (&lf;math>&lf;CONTENT/math>) should be deleted or annotated.
   * @return true, if the tag content should be annotated.
   */
  public boolean getShowMathTagContent() { return showMathTagContent; }

  /**
   * Set if the Contetn of math tags should be deleted or annotated.
   */
  public void setShowMathTagContent(boolean showMathTagContent) { this.showMathTagContent = showMathTagContent; }

  /**
   * Returns if the Parser should calculate the positions in the original source of the elements
   * which are parsed.
   * @return true, if the positions should be calulated.
   */
  public boolean getCalculateSrcSpans() {  return calculateSrcSpans; }

  /**
   * Sets if the Parser should calculate the positions in the original source of the elements
   * which are parsed.
   */
  public void setCalculateSrcSpans(boolean calculateSrcSpans) { this.calculateSrcSpans = calculateSrcSpans; }
}
// java2s.com  | Contact Us | Privacy Policy
// Copyright 2009 - 12 Demo Source and Support. All rights reserved.
// All other trademarks are property of their respective owners.