package Schmortopf.SearchResults;
/**
* Contains static search methods used by text searches.
*
*/
import java.util.*;
import Schmortopf.Utility.Text.*;
import Shared.Logging.Log;
/**
 * Static helpers that locate the occurrences of a search string inside a text
 * buffer, either exactly (substring search) or approximately (word based,
 * using an edit distance, optionally pre-filtered by Soundex codes).
 */
public class SearchMethods
{

  // Available search methods. The position of a name in this array is the
  // approximateSearchMethodIndex expected by SearchTextOccurences().
  public final static String[] ApproximateSearchMethodNames = { "Edit-distance", "Soundex"}; // , "Metaphone", "Editex"

  // Indices into ApproximateSearchMethodNames.
  private final static int EditDistanceMethodIndex = 0;
  private final static int SoundexMethodIndex = 1;

  // More would be ridiculous, because the user would be confronted
  // with a huge tree. One search can go over thousands of single file searches.
  private final static int MaximumNumberOfHitsPerSingleSearch = 2000;

  // When true, the approximate search logs every examined word and its distance.
  public final static boolean debug = false;


  /**
   * Searches the textBuffer for the rawSearchedText and dispatches to the
   * approximate or to the exact search implementation.
   *
   * Note that doCaseSensitiveSearch and doWholeWordsSearch are only used by
   * the exact search, while approximateSearchMethodIndex and tolerance are
   * only used by the approximate search.
   *
   * @param rawSearchedText the text to look for
   * @param doCaseSensitiveSearch exact search only: compare case sensitive
   * @param doWholeWordsSearch exact search only: accept a hit only if it is
   *        not directly surrounded by java identifier characters
   * @param doApproximateSearch true for the approximate, false for the exact search
   * @param approximateSearchMethodIndex is the index of the method in the array ApproximateSearchMethodNames
   * @param tolerance is the tolerance that permits to identify an approximate match
   * @param textBuffer the text to search in
   * @param doReturnAfterFirstMatch if true, the search stops at the first hit
   * @return an array with all positions of found occurrences of the
   *         rawSearchedText in the textBuffer, in ascending order.
   */
  public static int[] SearchTextOccurences( final String rawSearchedText,
                                            final boolean doCaseSensitiveSearch,
                                            final boolean doWholeWordsSearch,
                                            final boolean doApproximateSearch,
                                            final int approximateSearchMethodIndex,
                                            final int tolerance,
                                            final StringBuffer textBuffer,
                                            final boolean doReturnAfterFirstMatch )
  {
    if( doApproximateSearch )
    {
      return SearchApproximateWordOccurences( rawSearchedText, approximateSearchMethodIndex,
                                              tolerance, doReturnAfterFirstMatch, textBuffer );
    }
    return SearchExactTextOccurences( rawSearchedText, doCaseSensitiveSearch,
                                      doWholeWordsSearch, doReturnAfterFirstMatch, textBuffer );
  }


  /**
   * Performs an approximate word search. A case insensitive exact search is
   * always done first; afterwards the upper-cased text is split into words and
   * every word whose edit distance to the upper-cased search text lies in
   * 1..editTolerance is added. With the Soundex method, only words having the
   * same Soundex code as the search text are measured at all.
   *
   * An unknown approximateSearchMethodIndex yields the exact matches only.
   *
   * @param rawSearchedText the text to look for
   * @param approximateSearchMethodIndex index into ApproximateSearchMethodNames
   * @param editTolerance 0 return exact match, 1 allow one error
   *        (insert char, delete char, ...)
   * @param doReturnAfterFirstMatch if true, the search stops at the first hit
   * @param textBuffer the text to search in
   * @return the distinct hit positions in ascending order
   */
  private static int[] SearchApproximateWordOccurences( final String rawSearchedText,
                                                        final int approximateSearchMethodIndex,
                                                        final int editTolerance,
                                                        final boolean doReturnAfterFirstMatch,
                                                        final StringBuffer textBuffer )
  {
    // First of all : do an exact search, because the approximate won't find everything,
    // for example steph will not match stephan with an edit distance of 1
    final int[] exact = SearchExactTextOccurences( rawSearchedText, false, false,
                                                   doReturnAfterFirstMatch, textBuffer );
    if( doReturnAfterFirstMatch && ( exact.length > 0 ) )
    {
      return exact;
    }

    final boolean useSoundexFilter = ( approximateSearchMethodIndex == SoundexMethodIndex );
    final boolean useEditDistanceOnly = ( approximateSearchMethodIndex == EditDistanceMethodIndex );
    if( !useSoundexFilter && !useEditDistanceOnly )
    {
      // No known approximate method selected: the (already ascending) exact hits are the result.
      return exact;
    }

    // A sorted set is used, because the text is parsed twice (exact, then
    // approximate): the result must be ascending, and a word which contains the
    // searched text as prefix would otherwise be reported twice at the same position.
    final TreeSet positions = new TreeSet();
    for( int i = 0; i < exact.length; i++ )
    {
      positions.add( new Integer( exact[i] ) );
    }

    final String searchTextUP = rawSearchedText.toUpperCase();
    // The Soundex code of the searched text is only required for the Soundex method.
    final String soundexToSearch = useSoundexFilter ? Soundex.GetSoundexEnglish( searchTextUP ) : null;
    final String debugMarker = useSoundexFilter ? "++++++++++++ " : "**************** ";

    final WordTokenizer wt = new WordTokenizer( textBuffer.toString().toUpperCase() );
    // Size limitation : stop as soon as MaximumNumberOfHitsPerSingleSearch entries are collected.
    while( ( positions.size() < MaximumNumberOfHitsPerSingleSearch ) && wt.hasNext() )
    {
      final String word = wt.nextWord();
      if( useSoundexFilter && !soundexToSearch.equals( Soundex.GetSoundexEnglish( word ) ) )
      {
        continue; // sounds different, so it is not measured
      }
      // metric, allow only up to editTolerance
      final int dst = EditDistance.EditDistance( word, searchTextUP, editTolerance );
      if( debug ) Log.Info(" "+word+", "+dst);
      // dst==0 is excluded, because such a word is presumably identical to the
      // searched text and therefore already contained in the exact hits.
      if( ( dst > 0 ) && ( dst <= editTolerance ) )
      {
        if( debug ) Log.Info(debugMarker+word+", "+dst);
        // position of match (presumably the start offset of the word just returned
        // by nextWord() - confirm against WordTokenizer)
        positions.add( new Integer( wt.getLastWordStartPos() ) );
        if( doReturnAfterFirstMatch )
        {
          break;
        }
      }
    }
    return ToIntArray( positions );
  }


  /**
   * Searches all non overlapping occurrences of the rawSearchedText in the
   * textBuffer using a plain substring search.
   *
   * For a case insensitive search, both texts are lower-cased first; the
   * returned positions are indices into the text which was actually searched.
   *
   * @param rawSearchedText the text to look for; an empty text yields no hits
   * @param doCaseSensitiveSearch if false, both texts are lower-cased before comparing
   * @param doWholeWordsSearch if true, a hit is only accepted when the characters
   *        directly before and after it (if any) are no java identifier parts
   * @param doReturnAfterFirstMatch if true, the search stops at the first accepted hit
   * @param textBuffer the text to search in
   * @return the accepted hit positions in ascending order,
   *         at most MaximumNumberOfHitsPerSingleSearch entries
   */
  private static int[] SearchExactTextOccurences( final String rawSearchedText,
                                                  final boolean doCaseSensitiveSearch,
                                                  final boolean doWholeWordsSearch,
                                                  final boolean doReturnAfterFirstMatch,
                                                  final StringBuffer textBuffer )
  {
    final Vector positionVector = new Vector();
    if( rawSearchedText.length() > 0 )
    {
      final String searchedText;
      final String sourceText;
      if( doCaseSensitiveSearch )
      {
        searchedText = rawSearchedText;
        sourceText = textBuffer.toString();
      }
      else
      {
        searchedText = rawSearchedText.toLowerCase();
        sourceText = textBuffer.toString().toLowerCase();
      }
      // The lengths are taken from the strings which are really compared,
      // because lower-casing is able to change the length of a string.
      final int searchedTextLength = searchedText.length();
      final int sourceTextLength = sourceText.length();

      int searchIndex = 0;
      while( searchIndex < sourceTextLength )
      {
        // indexOf() never returns a non negative index below searchIndex,
        // so no additional sanity check is required here.
        final int hitIndex = sourceText.indexOf( searchedText, searchIndex );
        if( hitIndex < 0 )
        {
          break; // no more occurrences
        }
        final int hitEndIndex = hitIndex + searchedTextLength;
        if( !doWholeWordsSearch || IsDelimitedByWordBorders( sourceText, hitIndex, hitEndIndex ) )
        {
          positionVector.addElement( new Integer( hitIndex ) );
          // Size limitation : Break when we reach MaximumNumberOfHits added entries.
          // This is IMPORTANT, because it would be a vulnerability of the IDE, if not checked.
          // There is a global check outside, which ends the search over all files,
          // once an overall limit has been reached.
          if( doReturnAfterFirstMatch ||
              ( positionVector.size() >= MaximumNumberOfHitsPerSingleSearch ) )
          {
            break;
          }
        }
        // There has been a possible match (accepted or not). For the next
        // search run, we can continue behind it therefore :
        searchIndex = hitEndIndex;
      }
    }
    return ToIntArray( positionVector );
  } // SearchExactTextOccurences


  /**
   * Tells whether text[startIndex..endIndex) is a whole word: the character
   * before startIndex and the character at endIndex must not be java identifier
   * parts. The begin and the end of the text count as word borders.
   */
  private static boolean IsDelimitedByWordBorders( final String text,
                                                   final int startIndex,
                                                   final int endIndex )
  {
    if( ( startIndex > 0 ) && Character.isJavaIdentifierPart( text.charAt( startIndex - 1 ) ) )
    {
      return false;
    }
    if( ( endIndex < text.length() ) && Character.isJavaIdentifierPart( text.charAt( endIndex ) ) )
    {
      return false;
    }
    return true;
  }


  /**
   * Copies a collection of Integer objects into an int array, keeping the
   * iteration order of the collection.
   */
  private static int[] ToIntArray( final Collection positions )
  {
    final int[] positionArray = new int[ positions.size() ];
    int index = 0;
    for( Iterator it = positions.iterator(); it.hasNext(); )
    {
      positionArray[index++] = ((Integer) it.next()).intValue();
    }
    return positionArray;
  }

} // SearchMethods
|