List of usage examples for `org.apache.lucene.index.IndexWriterConfig#setOpenMode`.
public IndexWriterConfig setOpenMode(OpenMode openMode)
From source file:it.drwolf.ridire.index.sketch.SketchCreatorManager.java
License:Apache License
public void closeIndex() { try {/*from ww w . j a va 2 s. com*/ if (this.sketchCreatorData.getIndexWriter() != null) { this.sketchCreatorData.getIndexWriter().close(); } else { String indexLocation = this.entityManager .find(Parameter.class, Parameter.SKETCH_INDEX_LOCATION.getKey()).getValue(); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_33, new KeywordAnalyzer()); indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); IndexWriter indexWriter = new IndexWriter(new MMapDirectory(new File(indexLocation)), indexWriterConfig); if (indexWriter != null) { indexWriter.close(); } } } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:it.drwolf.ridire.index.sketch.SketchCreatorManager.java
License:Apache License
private void doCreateSketches(String indexLocation) { IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_33, new KeywordAnalyzer()); indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); try {/*from ww w .ja v a2 s.c o m*/ IndexWriter indexWriter = new IndexWriter(new MMapDirectory(new File(indexLocation)), indexWriterConfig); this.sketchCreatorData.setIndexWriter(indexWriter); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } this.sketchCreatorData.setProcessNumber(this.processNumber); this.sketchCreatorData.setWorkingDir(this.workingDir); this.sketchCreator.createSketches(this.sketchCreatorData); }
From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java
License:Apache License
public static void main(String[] args) { Properties properties = new Properties(); InputStream input = null;//w w w . ja v a2s. c om try { if (System.getProperty("properties.path") != null) { input = new FileInputStream(System.getProperty("properties.path")); properties.load(input); } else { logger.info("Loading default property file [resources/lucene-clef.properties]"); ClassLoader loader = Thread.currentThread().getContextClassLoader(); input = loader.getResourceAsStream("lucene-clef.properties"); properties.load(input); } } catch (IOException ex) { ex.printStackTrace(); } finally { if (input != null) { try { input.close(); } catch (IOException e) { e.printStackTrace(); } } } properties.putAll(System.getProperties()); String language = properties.getProperty("language"); String stemmer = properties.getProperty("stemmer"); String stopsetType = properties.getProperty("stopset.type"); String stopsetPath = null; if (stopsetType.equalsIgnoreCase("CUSTOM")) { stopsetPath = properties.getProperty("stopset.path"); } String corporaRootPath = properties.getProperty("corpora.path"); int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size")); String[] corpora = properties.getProperty(language + ".corpora").split(";"); TrecContentSource trecContentSource = new TrecContentSource(); try { Properties configProps = new Properties(); configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser"); configProps.setProperty("content.source.verbose", "false"); configProps.setProperty("content.source.forever", "false"); configProps.setProperty("content.source.excludeIteration", "true"); configProps.setProperty("work.dir", new File(".").getAbsolutePath()); configProps.setProperty("language", language); configProps.setProperty("stemmer", stemmer); configProps.setProperty("stopset_type", stopsetType); configProps.setProperty("stopset_path", stopsetPath); // set lucene index directory Path indexPath = new 
File(properties.getProperty("index.path")).toPath(); Directory directory = new SimpleFSDirectory(indexPath); // indexing configuration CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath); Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setSimilarity(new BM25Similarity()); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(directory, conf); boolean storePositions = true; FieldType bodyFieldType = new FieldType(); if (storePositions) { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } else { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); } for (String corpus : corpora) { int docCount = 0; logger.info("... indexing corpus " + corpus); try { configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus); configProps.setProperty("content.source.encoding", properties.getProperty(corpus + ".encoding", "UTF-8")); trecContentSource.setConfig(new Config(configProps)); DocData docData = new DocData(); while ((docData = trecContentSource.getNextDocData(docData)) != null) { docCount++; // System.out.println("ID: "+docData.getName()); // System.out.println("BODY: "+docData.getBody()); Document doc = getDocumentFromDocData(docData, bodyFieldType); indexWriter.addDocument(doc); } } catch (NoMoreDataException e) { logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n"); } } indexWriter.close(); DirectoryReader ireader = DirectoryReader.open(directory); if (corpusSize != ireader.numDocs()) { throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be " + corpusSize); } logger.info("Number of documents: " + ireader.numDocs()); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
public static void indexDelimitedFile(String file, int indexColumn, int pathColumn, String dirIndex) { Date start = new Date(); try {//w ww. jav a2 s .c om if ((new File(dirIndex)).exists()) { (new File(dirIndex)).delete(); } Directory dir = FSDirectory.open(new File(dirIndex)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, file, indexColumn, pathColumn); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
public static void indexDelimitedFile(String file, int indexColumn, int[] pathColumn, String dirIndex) { Date start = new Date(); try {/* ww w .j a v a2 s. co m*/ if ((new File(dirIndex)).exists()) { (new File(dirIndex)).delete(); } Directory dir = FSDirectory.open(new File(dirIndex)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, file, indexColumn, pathColumn); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
public static void indexDelimitedFile(String file, int[] indexColumn, int[] pathColumn, String dirIndex) { Date start = new Date(); try {//from www. j a v a2 s . co m if ((new File(dirIndex)).exists()) { (new File(dirIndex)).delete(); } Directory dir = FSDirectory.open(new File(dirIndex)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, file, indexColumn, pathColumn); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main2(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" // TODO: Change the link with every release (or: fill in some less error-prone alternative here...) + "See http://lucene.apache.org/java/3_1/demo.html for details."; String indexPath = "index"; String docsPath = null;//ww w . ja va 2s . c o m boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. 
This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:kbp2013.index.IndexSourceCorpus.java
License:Open Source License
/**
 * Indexes the KBP source corpus into a Lucene index.
 * <p>
 * Reads {@code homelist} (one corpus-file name per line, resolved against
 * {@code home}), splits each file into {@code <DOC>...</DOC>} elements, extracts
 * each document's id, and indexes id + full document text via {@code indexDocs}.
 * Commits periodically so a crash loses at most {@code tocount} documents.
 */
public static void main(String[] args) throws IOException {
    initializeFromDefault();

    int managed = 0; // docs indexed since the last commit
    int counted = 0; // total docs indexed (progress display)
    int tocount = 10; // commit/report every `tocount` documents

    System.out.println("Indexing to directory '" + luceneIndex + "'...");
    INDEX_DIR = new File(luceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }
    Directory dir = FSDirectory.open(new File(luceneIndex));

    // Open lucene stuff
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    // iwc.setRAMBufferSizeMB(1024); // http://wiki.apache.org/lucene-java/ImproveIndexingSpeed
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, optionally verify the existing index first
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }
    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    final File docDir = new File(home);
    System.out.println("Indexing directory '" + home + "'...");
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    // homelist lists the corpus files to index, one name per line
    BufferedReader reader = new BufferedReader(new FileReader(homelist));

    String text = "";
    boolean verbose = true;
    while ((text = reader.readLine()) != null) {
        String filename = home + text;
        final File testFile = new File(filename);

        if (verbose) {
            System.out.println("---V-->" + "Indexing content of " + filename);
        }

        // NOTE(review): contains("\\.gz") matches the literal characters
        // backslash-dot-g-z, NOT the regex ".gz" — this filter almost certainly
        // never excludes anything; confirm the intended behaviour.
        if (testFile.isFile() && !filename.contains("\\.gz")) {
            // open file and read
            FileReader fread = new FileReader(filename);
            BufferedReader readerDoc = new BufferedReader(fread);

            String fileRef = ""; // the line containing the document id
            String fromfile = ""; // current line of the per-file reader
            String textdoc = ""; // accumulated text of the current <DOC> element

            while ((fromfile = readerDoc.readLine()) != null) {
                if (fromfile.toUpperCase().contains("<DOC ID=") || fromfile.toUpperCase().contains("<DOC>")) {
                    String fromdoc = fromfile;
                    // keep the opening line so mention offsets stay correct
                    textdoc = fromfile;
                    // Accumulate lines until the closing </DOC>.
                    // NOTE(review): if EOF occurs before </DOC>, readLine() returns
                    // null and the next toUpperCase() throws NPE — this assumes
                    // well-formed input files; confirm.
                    while (!fromdoc.toUpperCase().contains("</DOC>")) {
                        // remember the line carrying the doc id: it may be the
                        // opening line (first iteration) or a later <DOCID> line
                        if (fromdoc.toUpperCase().contains("<DOC ID=")
                                || fromdoc.toUpperCase().contains("<DOCID>")) {
                            fileRef = fromdoc;
                        }
                        fromdoc = readerDoc.readLine();
                        textdoc = textdoc + "\n" + fromdoc;
                    }
                    // Extract the document id; two forms occur in the corpus:
                    //   <DOCID> ALHURRA_NEWS13_ARB_20050412_130100-2.LDC2006E92 </DOCID>
                    //   <doc id="bolt-eng-DF-183-195681-7948494">
                    String idStr = fileRef;
                    if (idStr.contains("<DOCID>")) {
                        idStr = idStr.replace("<DOCID>", "");
                        idStr = idStr.replace("</DOCID>", "");
                        idStr = idStr.replace(" ", ""); // strip spaces
                    }
                    if (idStr.contains("<DOC id=")) {
                        idStr = idStr.replace("<DOC id=\"", "");
                        idStr = idStr.replaceAll("\".+>$", "");
                    }
                    indexDocs(writer, idStr, textdoc);

                    // progress reporting + periodic commit
                    managed++;
                    counted++;
                    if (verbose) {
                        System.out.println(
                                "---V-->" + counted + ":" + filename + ":" + idStr + ":" + textdoc.length());
                    }
                    if (managed > tocount) {
                        managed = 0;
                        System.out.println(counted + ":" + filename + ":------>" + idStr);
                        // commit keeps the index durable and memory bounded
                        writer.commit();
                    }
                } // end of if
            } // end of while
            readerDoc.close();
            fread.close();
        } else {
            // "Non lisible ou non requis" = not readable or not required
            System.out.println(counted + ":Non lisible ou non requis:" + filename);
        }
    }

    // close properly the index writer
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();
}
From source file:kbp2013.index.IndexSourceCorpus_v2.java
License:Open Source License
/**
 * Indexes a list of (possibly gzip-compressed) source files, one Lucene document
 * per {@code <doc>...</doc>} element, into the index at {@code indexDir}.
 * <p>
 * {@code inputLstFile} lists one source-file path per line. For each document,
 * both a whitespace-joined version (for field extraction) and a raw version
 * (line breaks marked with the literal token {@code MY_CUSTOM_SPACE}) are kept.
 *
 * @param args unused; paths come from the class fields indexDir / inputLstFile
 * @throws IOException            on any read/write failure
 * @throws FileNotFoundException  if a listed file is missing
 * @throws ClassNotFoundException declared by the original signature
 * @throws Exception              declared by the original signature
 */
public static void main(String[] args)
        throws IOException, FileNotFoundException, ClassNotFoundException, Exception {
    Date start = new Date();

    Directory targetIndexDir = FSDirectory.open(indexDir);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    if (indexDir.exists() == false) {
        // Create a new index in the directory, removing any
        // previously indexed documents
        iwc.setOpenMode(OpenMode.CREATE);
    } else {
        System.err.println("Adding files to existing index: '" + indexDir);
        // Add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    }
    // set ram buffer size (optional)
    iwc.setRAMBufferSizeMB(256.0);
    IndexWriter writer = new IndexWriter(targetIndexDir, iwc);
    System.err.println("Indexing to directory '" + indexDir + "'...");

    int docCount = 1;
    BufferedReader reader = new BufferedReader(new FileReader(inputLstFile));
    String currentDocPath;
    while ((currentDocPath = reader.readLine()) != null) {
        currentDocPath = currentDocPath.trim();
        System.err.println("Processing file: " + currentDocPath);
        // Processing each file; gzip files get an extra GZIPInputStream layer
        InputStream fileInputStream = new BufferedInputStream(new FileInputStream(currentDocPath));
        InputStreamReader streamReader;
        GZIPInputStream zipReader = null;
        if (currentDocPath.endsWith(".gz")) {
            // case when the file to index is a gzip file
            zipReader = new GZIPInputStream(fileInputStream);
            streamReader = new InputStreamReader(zipReader);
        } else {
            streamReader = new InputStreamReader(fileInputStream);
        }
        BufferedReader br = new BufferedReader(streamReader);

        String docTitle = "";
        String fileContent = "";
        String line;
        String docId = "";
        StringBuilder pageBuffer = new StringBuilder(); // space-joined page content
        // raw content with line breaks preserved as the MY_CUSTOM_SPACE marker
        StringBuilder rawPageBuffer = new StringBuilder();
        while ((line = br.readLine()) != null) {
            if (StringUtils.contains(line.toLowerCase(), "</doc>") == true) {
                pageBuffer.append(line).append(" ");
                rawPageBuffer.append(line + "MY_CUSTOM_SPACE");
                if (pageBuffer.length() > 0) {
                    // NOTE(review): the first argument of this replaceAll renders as a
                    // plain space here but was most likely a non-breaking space in the
                    // original source (normalizing NBSP to space) — confirm before editing.
                    fileContent = pageBuffer.toString().replaceAll(" ", " ");
                    docId = extractDocId(fileContent);
                    // get the title of the page
                    docTitle = extractTitle(fileContent);
                    // get the content of the page
                    String content = extractContent(fileContent);
                    String rawContent = extractRawContent(rawPageBuffer.toString());
                    indexDocument(writer, docId, content, docTitle, rawContent);
                    System.err.println("Processed " + docCount + " documents");
                    docCount++;
                }
                // reset buffers for the next document
                pageBuffer = new StringBuilder();
                rawPageBuffer = new StringBuilder();
            }
            // NOTE(review): this append also executes for the "</doc>" line handled
            // above, so the closing line is appended twice — once into the finished
            // document and once into the freshly reset buffer for the next document.
            // Confirm whether this is intended.
            pageBuffer.append(line).append(" ");
            rawPageBuffer.append(line + "MY_CUSTOM_SPACE");
        }
        fileInputStream.close();
        if (currentDocPath.endsWith(".gz") && zipReader != null) {
            zipReader.close();
        }
        streamReader.close();
    }
    reader.close();
    writer.close();

    Date end = new Date();
    System.err.println(end.getTime() - start.getTime() + " total milliseconds");
}
From source file:kbp2013.index.IndexWikipediaCorpus.java
License:Open Source License
/**
 * Indexes a Wikipedia XML dump into a Lucene index.
 * <p>
 * First reads the dump's {@code <siteinfo>} header to collect the non-article
 * namespace prefixes ("Talk:", "File:", ...), then streams {@code <page>}
 * elements, skipping namespace pages and {@code #REDIRECT} pages, and indexes
 * the remaining ones via {@code indexDocs}. Commits periodically.
 */
public static void main(String[] args) throws IOException {
    initializeFromDefault();

    int managed = 0; // pages processed since the last commit
    int counted = 0; // total pages processed (progress display)
    int tocount = 1000; // commit/report every `tocount` pages
    int saved = 0; // pages actually indexed (redirects/namespace pages excluded)

    System.out.println("Indexing Wikipedia Dump to directory '" + wikiluceneIndex + "'...");
    INDEX_DIR = new File(wikiluceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }
    Directory dir = FSDirectory.open(new File(wikiluceneIndex));

    // Open lucene stuff
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    // configure Lucene Stuff
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, optionally verify the existing index first
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }
    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    // --------------------------
    //
    // Open the Wikipedia Dump
    //
    // --------------------------
    BufferedReader reader = new BufferedReader(new FileReader(wikidump));

    // Read the <siteinfo> header and register the non-article namespaces so
    // their pages can be skipped later.
    // NOTE(review): if the dump ends before "</siteinfo>", readLine() returns
    // null and text.contains(...) throws NPE — assumes a well-formed dump.
    String text = "";
    ArrayList domain = new ArrayList();
    while (!text.contains("</siteinfo>")) {
        text = reader.readLine();
        // namespace key 0 is the main article namespace and is NOT registered
        if (text.contains("<namespace key=") && !text.contains("<namespace key=\"0")) {
            String thisnamespace = text.replaceAll("<namespace key=[^>]+>", "");
            thisnamespace = thisnamespace.replaceAll("</namespace>", "");
            thisnamespace = thisnamespace.replaceAll("^[ ]+", "");
            thisnamespace = thisnamespace + ":";
            if (!thisnamespace.contentEquals("")) {
                domain.add(thisnamespace);
                System.out.println("Registered domain:" + thisnamespace + ";");
            }
        }
    }
    System.out.println("--------------------------------");

    // read the pages
    while ((text = reader.readLine()) != null) {
        String textdoc = ""; // accumulated XML of the current <page>
        String pagename = "";
        boolean tosave = true;

        // beginning of a page: accumulate lines until </page>
        if (text.contains("<page>")) {
            textdoc = text;
            while (!text.contains("</page>")) {
                text = reader.readLine();
                textdoc = textdoc + text;
                if (text.contains("<title>")) {
                    pagename = text.replaceAll("<title>", "");
                    pagename = pagename.replaceAll("</title>", "");
                    // collapse runs of spaces left by the XML indentation
                    pagename = pagename.replaceAll("[ ]{2,10}", "");
                }
                // safety
            }
            // after reading the whole page, decide whether to index it:
            // A) skip pages whose title carries a registered non-article namespace
            for (int a = 0; a < domain.size(); a++) {
                String domaintosearch = domain.get(a).toString();
                if (pagename.toLowerCase().contains(domaintosearch.toLowerCase())) {
                    System.out.println("Specific page:" + pagename);
                    tosave = false;
                }
            }
            // B) skip redirect pages
            if (textdoc.contains("#REDIRECT")) {
                tosave = false;
            }
            if (tosave) {
                saved++;
                indexDocs(writer, pagename, textdoc);
            }
            // display info + periodic commit to keep memory bounded
            managed++;
            counted++;
            if (managed > tocount) {
                managed = 0;
                System.out.println(counted + ":" + saved + ":" + pagename + ":------>" + textdoc.length());
                writer.commit();
            }
        }
    } // end while

    // close properly the index writer
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();
}