
# ----------------------------------------------------------
# advas 0.1.9
# advanced search algorithms implemented as a python class
#
# (C) 2002 - 2004 Frank Hofmann, Chemnitz, Germany
# email fh@efho.de
# ----------------------------------------------------------

# updated 2004-08-29

# other modules required by advas
import string
import re
import math

class Advas:
    """Advanced search helpers: word statistics, phonetic codes, stemmers,
    n-gram similarity, ranking, string search, synonym lookup and category
    trees.

    The object carries a small amount of state that several methods share
    and overwrite as a side effect:

    filename / use_filename -- word list file and the flag selecting it
    line                    -- the text most recently split into words
    words                   -- the current word list
    list                    -- the current word:frequency dictionary
    ngram_size              -- n-gram size used by get_ngrams()

    The code is written to run unchanged on Python 2.6+ and Python 3.
    """

    # letters tracked by the successor variety table
    _SUCC_ALPHABET = "abcdefghijklmnopqrstuvwxyz@' "

    # letter -> soundex digit, identical for lower and upper case
    _SOUNDEX_CODES = dict(zip(
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
        "01230120022455012623010202" * 2))

    def __init__(self):
        """Initialise every piece of state to its default value."""
        self.init_filename()
        self.init_line()
        self.init_words()
        self.init_list()
        self.init_ngrams()

    def reinit(self):
        """Reset the object to the state of a freshly created instance."""
        self.__init__()

    def set_state(self, state):
        """Restore internal state from the dictionary *state*.

        Only keys present in *state* are applied; recognised keys are
        "filename", "use_filename", "line", "words", "list", "ngram_size".
        """
        if "filename" in state:
            self.set_filename(state["filename"])
        if "use_filename" in state:
            if state["use_filename"]:
                self.set_use_filename()
            else:
                self.set_use_wordlist()
        if "line" in state:
            self.set_line(state["line"])
        if "words" in state:
            self.set_words(state["words"])
        if "list" in state:
            self.set_list(state["list"])
        if "ngram_size" in state:
            self.set_ngram_size(state["ngram_size"])

    def get_state(self):
        """Return the internal state as a dictionary accepted by set_state()."""
        return {
            "filename": self.get_filename(),
            "use_filename": self.get_use_filename(),
            "line": self.get_line(),
            "words": self.get_words(),
            "list": self.get_list(),
            "ngram_size": self.get_ngram_size(),
        }

    # basic functions ==========================================
    # file name ------------------------------------------------

    def init_filename(self):
        """Clear the file name and select the in-memory word list."""
        self.filename = ""
        self.use_filename = 0

    def set_filename(self, filename):
        """Store the name of the word list file."""
        self.filename = filename

    def get_filename(self):
        """Return the stored word list file name."""
        return self.filename

    def set_use_filename(self):
        """Make word-list based methods read from the stored file name."""
        self.use_filename = 1

    def get_use_filename(self):
        """Return 1 if the word list is read from file, 0 otherwise."""
        return self.use_filename

    def set_use_wordlist(self):
        """Make word-list based methods use the in-memory word list."""
        self.use_filename = 0

    def get_file_contents(self, file_name):
        """Return the lines of *file_name* as a list (line ends included).

        If the file cannot be opened a message is printed and -1 is
        returned; callers in this class test for that value.
        """
        try:
            file_id = open(file_name, "r")
        except (IOError, OSError, TypeError, ValueError):
            print("[AdvaS] I/O Error - can't open given file: %s" % (file_name,))
            return -1

        # close the file even if reading fails
        try:
            return file_id.readlines()
        finally:
            file_id.close()

    # line -----------------------------------------------------

    def init_line(self):
        """Reset the current line to the empty string."""
        self.line = ""

    def set_line(self, line):
        """Store the line of text to work on."""
        self.line = line

    def get_line(self):
        """Return the current line of text."""
        return self.line

    def split_line(self):
        """Split the current line into words and store them in self.words.

        A word is a run of word characters and apostrophes.
        """
        tokens = re.compile(r"[\w']+")
        self.words = tokens.findall(self.line)

    # words ----------------------------------------------------

    def init_words(self):
        """Reset the word list (to an empty container)."""
        self.words = {}

    def set_words(self, words):
        """Store the word list."""
        self.words = words

    def get_words(self):
        """Return the current word list."""
        return self.words

    def count_words(self):
        """Count the items of self.words and store word:frequency in self.list."""
        frequencies = {}
        for item in self.words:
            frequencies[item] = frequencies.get(item, 0) + 1
        self.set_list(frequencies)

    # lists ----------------------------------------------------

    def init_list(self):
        """Reset the word:frequency dictionary."""
        self.list = {}

    def set_list(self, list):
        """Store the word:frequency dictionary."""
        self.list = list

    def get_list(self):
        """Return the current word:frequency dictionary."""
        return self.list

    def merge_lists(self, *lists):
        """Merge word:frequency dictionaries, adding up the frequencies.

        The result is stored in self.list.
        """
        merged = {}
        for current_list in lists:
            for item in current_list:
                merged[item] = merged.get(item, 0) + current_list[item]
        self.set_list(merged)

    def merge_lists_idf(self, *lists):
        """Merge word dictionaries, counting in how many of them a word occurs.

        The result (word:number of dictionaries) is stored in self.list.
        """
        merged = {}
        for current_list in lists:
            for item in current_list:
                merged[item] = merged.get(item, 0) + 1
        self.set_list(merged)

    def compact_list(self):
        """Merge entries of self.list that are equal after strip() and lower()."""
        original = self.list
        compacted = {}
        for key in original:
            item = key.strip().lower()
            compacted[item] = compacted.get(item, 0) + original[key]
        self.set_list(compacted)

    def remove_items(self, remove):
        """Delete every item of *remove* from self.list (in place)."""
        current = self.list
        # iterate over a copy so that passing self.list itself is safe
        for item in list(remove):
            if item in current:
                del current[item]
        self.set_list(current)

    # string function ------------------------------------------

    def cmp_strings(self, term1, term2):
        """Compare two strings; return 0 if equal, -1 if term1 < term2, else 1.

        The shorter string is padded with blanks before the comparison, so
        trailing blanks are not significant.
        """
        width = max(len(term1), len(term2))
        left = term1.ljust(width)
        right = term2.ljust(width)
        if left < right:
            return -1
        if left > right:
            return 1
        return 0

    # n-gram functions -----------------------------------------

    def init_ngrams(self):
        """Reset the n-gram size to 2."""
        self.ngram_size = 2

    def set_ngram_size(self, size):
        """Store the n-gram size."""
        self.ngram_size = size

    def get_ngram_size(self):
        """Return the n-gram size."""
        return self.ngram_size

    def get_ngrams(self, term):
        """Return the distinct n-grams of *term* as a list.

        If the n-gram size is below 2 or larger than the term, no n-grams
        can be formed and *term* itself is returned unchanged.
        Side effect: self.words and self.list are overwritten with the
        n-grams and their frequencies.
        """
        size = self.ngram_size
        if size > len(term) or size < 2:
            return term

        ngrams = [term[left:left + size]
                  for left in range(len(term) - size + 1)]

        # calculate term frequency, keep the distinct n-grams
        self.set_words(ngrams)
        self.count_words()
        return list(self.get_list())

    # successor variety ----------------------------------------

    def calc_succ_variety(self):
        """Return letter:successor variety for the current word list.

        The variety of a letter is the number of distinct characters of the
        alphabet that follow it in the first word of every entry (a blank
        marks the end of a word).  The word list is read from the stored
        file or from self.words, depending on the use_filename flag.
        If the file cannot be read every variety is 0.
        Side effect: self.line and self.words are overwritten.
        """
        alphabet = self._SUCC_ALPHABET

        if self.get_use_filename():
            word_list = self.get_file_contents(self.get_filename())
            if word_list == -1:
                return dict((letter, 0) for letter in alphabet)
        else:
            word_list = self.get_words()

        followers = dict((letter, set()) for letter in alphabet)

        # copy: split_line() below replaces self.words
        for entry in list(word_list):
            # remove upper letters and special chars
            self.set_line(entry.lower())
            self.split_line()
            without_spec = self.get_words()
            if not without_spec:
                # blank line or nothing but special characters
                continue

            # additional blank marks the end of the word
            term = without_spec[0] + " "
            for current_letter, next_letter in zip(term, term[1:]):
                # letters outside the alphabet are not tracked
                if current_letter in followers and next_letter in alphabet:
                    followers[current_letter].add(next_letter)

        return dict((letter, len(followers[letter])) for letter in alphabet)

    # r s v ----------------------------------------------------

    def calc_rsv(self, d, p, q):
        """Return the retrieval status value of a document.

        d    -- list with 1 (descriptor exists) or 0 (does not exist)
        p, q -- lists of probabilities of existence and non-existence

        rsv = sum(d[i] * ln((p[i]/(1-p[i])) / (q[i]/(1-q[i]))))
        Returns 0 if the three lists differ in length.  Probabilities of
        exactly 0 or 1 are not handled and raise an arithmetic error.
        """
        if len(p) != len(q) or len(d) != len(p):
            return 0

        rsv = 0
        for i in range(len(p)):
            eq_upper = float(p[i]) / float(1 - p[i])
            eq_lower = float(q[i]) / float(1 - q[i])
            rsv = rsv + float(d[i] * math.log(eq_upper / eq_lower))
        return rsv

    def convert_dictionary_into_list(self, original):
        """Return the keys of *original* as a list."""
        return list(original)

    def convert_list_into_dictionary(self, original, init_value):
        """Return a dictionary mapping every item of *original* to *init_value*."""
        return dict.fromkeys(original, init_value)

    def is_comment(self, line):
        """Return 1 if *line* starts with "#" after leading whitespace, else 0."""
        if line.lstrip().startswith("#"):
            return 1
        return 0

    # advanced functions =======================================
    # term frequency (tf) --------------------------------------

    def tf(self, text):
        """Return word:frequency for *text*.

        Side effect: self.line, self.words and self.list are overwritten.
        """
        self.set_line(text)
        self.split_line()
        self.count_words()
        return self.list

    def tf_stop(self, text, stop_list):
        """Return word:frequency for *text* without the items of *stop_list*.

        text      -- a line of text
        stop_list -- dictionary (or other iterable) of words to drop
        """
        self.tf(text)
        self.remove_items(stop_list)
        return self.get_list()

    def idf(self, documents, word_list):
        """Return term:inverse document frequency.

        documents -- N, the number of documents
        word_list -- term:n, the number of documents containing the term

        idf = ln(N/n), computed with true division.  A frequency of 0 is
        not handled and raises ZeroDivisionError.
        """
        new_list = {}
        for item in word_list:
            new_list[item] = math.log(float(documents) / word_list[item])
        return new_list

    # sound-like algorithms ------------------------------------

    def soundex(self, term):
        """Return the four character soundex code of *term*.

        The first character is kept (upper-cased), the following letters
        are replaced by their soundex digit, runs of equal digits are
        collapsed, zeros are dropped and the result is padded with "0".
        Letters mapped to 0 (vowels, h, w, y) separate equal digits.
        An empty term yields "0000".
        """
        # Based on the version by Nathan Heagy (Front Logic Inc., 2000).
        if not term:
            return "0000"  # could be Z000 for compatibility with other implementations

        first_char = term[0].upper()
        table = self._SOUNDEX_CODES

        # translate and remove duplicate digits in a row; characters
        # without a code are kept as they are
        collapsed = []
        previous = " "
        for char in term:
            digit = table.get(char, char)
            if digit != previous:
                collapsed.append(digit)
                previous = digit

        # drop the code of the first character BEFORE removing the zeros,
        # otherwise a leading vowel swallows the first consonant code
        tail = "".join(collapsed[1:]).replace("0", "")

        return (first_char + tail + "000")[:4]

    def metaphone(self, term):
        """Return the metaphone code of *term*.

        The rules are matched against lower-case letters; characters
        without a rule (including vowels) are copied unchanged.
        """
        code = ""
        term_length = len(term)
        if term_length == 0:
            return code

        # exceptions at the beginning of the term
        initial_table = {
            "ae": "e",
            "gn": "n",
            "kn": "n",
            "pn": "n",
            "wr": "r",
            "wh": "w",
        }
        first_chars = term[0:2]
        if term_length > 1 and first_chars in initial_table:
            code = initial_table[first_chars]
            term = term[2:]
        elif term[0] == "x":
            # initial x sounds like s
            code = "s"
            term = term[1:]
        term_length = len(term)

        # define standard translation table
        st_trans = {
            "b": "b",
            "c": "k",
            "d": "t",
            "g": "k",
            "h": "h",
            "k": "k",
            "p": "p",
            "q": "k",
            "s": "s",
            "t": "t",
            "v": "f",
            "w": "w",
            "x": "ks",
            "y": "y",
            "z": "s"
        }

        i = 0
        while i < term_length:
            # init character to add, init basic patterns
            # part_n_*: current letter plus the following ones
            # part_c_*: previous letter, current letter (and the next one)
            add_char = ""
            part_n_2 = ""
            part_n_3 = ""
            part_n_4 = ""
            part_c_2 = ""
            part_c_3 = ""

            if i < (term_length - 1):
                part_n_2 = term[i:i+2]
            if i > 0:
                # previous + current needs no following letter
                part_c_2 = term[i-1:i+1]
                if i < (term_length - 1):
                    part_c_3 = term[i-1:i+2]
            if i < (term_length - 2):
                part_n_3 = term[i:i+3]
            if i < (term_length - 3):
                part_n_4 = term[i:i+4]

            current = term[i]

            # use table with conditions for translations
            if current == "b":
                add_char = st_trans["b"]
                # silent b in final "mb"
                if i == (term_length - 1) and i > 0 and term[i-1] == "m":
                    add_char = ""
            elif current == "c":
                add_char = st_trans["c"]
                if part_n_2 == "ch":
                    add_char = "x"
                elif re.search(r'c[iey]', part_n_2):
                    add_char = "s"

                if part_n_3 == "cia":
                    add_char = "x"

                if re.search(r'sc[iey]', part_c_3):
                    add_char = ""
            elif current == "d":
                add_char = st_trans["d"]
                if re.search(r'dg[eyi]', part_n_3):
                    add_char = "j"
            elif current == "g":
                add_char = st_trans["g"]

                if part_n_2 == "gh":
                    if i == (term_length - 2):
                        add_char = ""
                elif re.search(r'gh[aeiouy]', part_n_3):
                    add_char = ""
                elif part_n_2 == "gn":
                    add_char = ""
                elif part_n_4 == "gned":
                    add_char = ""
                elif re.search(r'dg[eyi]', part_c_3):
                    add_char = ""
                elif part_n_2 == "gi":
                    if part_c_3 != "ggi":
                        add_char = "j"
                elif part_n_2 == "ge":
                    if part_c_3 != "gge":
                        add_char = "j"
                elif part_n_2 == "gy":
                    if part_c_3 != "ggy":
                        add_char = "j"
                elif part_n_2 == "gg":
                    add_char = ""
            elif current == "h":
                add_char = st_trans["h"]
                if re.search(r'[aeiouy]h[^aeiouy]', part_c_3):
                    add_char = ""
                elif re.search(r'[csptg]h', part_c_2):
                    add_char = ""
            elif current == "k":
                add_char = st_trans["k"]
                if part_c_2 == "ck":
                    add_char = ""
            elif current == "p":
                add_char = st_trans["p"]
                if part_n_2 == "ph":
                    add_char = "f"
            elif current == "q":
                add_char = st_trans["q"]
            elif current == "s":
                add_char = st_trans["s"]
                if part_n_2 == "sh":
                    add_char = "x"

                if re.search(r'si[ao]', part_n_3):
                    add_char = "x"
            elif current == "t":
                add_char = st_trans["t"]
                if part_n_2 == "th":
                    add_char = "0"

                if re.search(r'ti[ao]', part_n_3):
                    add_char = "x"
            elif current == "v":
                add_char = st_trans["v"]
            elif current == "w":
                add_char = st_trans["w"]
                if re.search(r'w[^aeiouy]', part_n_2):
                    add_char = ""
            elif current == "x":
                add_char = st_trans["x"]
            elif current == "y":
                add_char = st_trans["y"]
            elif current == "z":
                add_char = st_trans["z"]
            else:
                # no rule: keep the character
                add_char = current

            code = code + add_char
            i += 1

        return code

    def nysiis(self, term):
        """Return a New York State Identification and Intelligence System
        (NYSIIS) style code for the lower-case *term*.
        """
        code = ""
        if len(term) == 0:
            return code

        # translate the first characters; longer prefixes are listed first
        # so that "kn" wins over "k" regardless of iteration order
        prefixes = (
            ("mac", "mcc"),
            ("sch", "sss"),
            ("kn", "nn"),
            ("ph", "ff"),
            ("pf", "ff"),
            ("k", "c"),
        )
        for entry, value in prefixes:
            if term.startswith(entry):
                term = value + term[len(entry):]
                break

        # translate the last characters
        suffixes = (
            ("ee", "y"),
            ("ie", "y"),
            ("dt", "d"),
            ("rt", "d"),
            ("rd", "d"),
            ("nt", "d"),
            ("nd", "d"),
        )
        for entry, value in suffixes:
            if term.endswith(entry):
                # cut exactly the matched suffix
                term = term[:-len(entry)] + value
                break

        code = term

        # transform ev->af
        code = re.sub(r'ev', r'af', code)

        # transform vowels->a
        # NOTE(review): "y" is folded to "a" as well, which makes the
        # "ay$" rule at the end unreachable - confirm the intent
        code = re.sub(r'[aeiouy]', r'a', code)

        # transform q->g, z->s, m->n
        code = re.sub(r'q', r'g', code)
        code = re.sub(r'z', r's', code)
        code = re.sub(r'm', r'n', code)

        # transform kn->n, k->c
        code = re.sub(r'kn', r'n', code)
        code = re.sub(r'k', r'c', code)

        # transform sch->sss, ph->ff
        code = re.sub(r'sch', r'sss', code)
        code = re.sub(r'ph', r'ff', code)

        # transform h-> if previous or next is nonvowel -> previous
        occur = re.findall(r'([a-z]{0,1}?)h([a-z]{0,1}?)', code)
        for occur_item_previous, occur_item_next in occur:
            if (re.match(r'[^aeiouy]', occur_item_previous)
                    or re.match(r'[^aeiouy]', occur_item_next)):
                if occur_item_previous != "":
                    code = re.sub(occur_item_previous + "h",
                                  occur_item_previous * 2, code, count=1)

        # transform w-> if previous is vowel -> previous
        for vowel in re.findall(r'([aeiouy]{1}?)w', code):
            code = re.sub(vowel + "w", vowel * 2, code, count=1)

        # check last character
        # -s, remove
        code = re.sub(r's$', r'', code)
        # -ay, replace by -y
        code = re.sub(r'ay$', r'y', code)
        # -a, remove
        code = re.sub(r'a$', r'', code)

        return code

    # table lookup stemmer -------------------------------------

    def table_lookup_stemmer(self, term, stem_file):
        """Return the stem of *term* as listed in *stem_file*.

        The stem file holds one "term : stem" pair per line and has to be
        sorted alphabetically by term, because it is searched with a
        binary search.  Returns "" if the file cannot be read, the term is
        not found, or a malformed line is hit.
        """
        stem = ""
        contents = self.get_file_contents(stem_file)
        if contents == -1:
            return stem

        pattern = re.compile(r"\w+(?=[:\s,])")
        left = 0
        right = len(contents) - 1

        while right >= left:
            middle = (right + left) // 2

            # the extra newline lets the pattern see the last word of a
            # final line that has no line end
            item_list = pattern.findall(contents[middle] + "\n")
            if not item_list:
                # blank or malformed line: the search cannot go on
                return stem

            result = self.cmp_strings(term, item_list[0])
            if result == 0:
                if len(item_list) > 1:
                    stem = item_list[1]
                return stem
            elif result == -1:
                # mismatch, somewhere before
                right = middle - 1
            else:
                # mismatch, somewhere later
                left = middle + 1
        return stem

    def successor_variety_stemmer(self, term, word_list, flag):
        """Split *term* into stems using the successor variety of a corpus.

        word_list -- file name (flag true) or list of words (flag false)

        A letter whose successor variety is higher than that of both
        neighbours (a peak) ends a stem.  Letters unknown to the variety
        table count as 0.  Returns the list of stems; terms shorter than
        three characters are returned as a single stem.
        Side effect: the file name / word list state is overwritten.
        """
        if flag:
            self.set_filename(word_list)
            self.set_use_filename()
        else:
            self.set_words(word_list)
            self.set_use_wordlist()
        variety = self.calc_succ_variety()

        stem_list = []
        start = 0

        for i in range(1, len(term) - 1):
            before = variety.get(term[i-1], 0)
            peak = variety.get(term[i], 0)
            after = variety.get(term[i+1], 0)
            if peak > before and peak > after:
                # save slice as a stem, continue behind it
                stem_list.append(term[start:i+1])
                start = i + 1

        # still something left in buffer?
        remainder = term[start:]
        if remainder:
            stem_list.append(remainder)

        return stem_list

    # n-gram functions -----------------------------------------

    def comp_ngrams(self, term1, term2, size):
        """Return the degree of equality of two terms after Dice.

        S = 2C/(A+B)

        A, B -- number of distinct n-grams of term 1 and of term 2
        C    -- number of distinct n-grams contained in both terms

        A term for which no n-gram can be formed counts as a single
        n-gram (the term itself).
        Side effect: the n-gram size is set to *size*; self.words and
        self.list are overwritten.
        """
        self.set_ngram_size(size)

        def distinct_ngrams(term):
            ngrams = self.get_ngrams(term)
            if not isinstance(ngrams, list):
                # get_ngrams() handed the term back: no n-grams possible
                ngrams = [ngrams]
            return set(ngrams)

        ngrams1 = distinct_ngrams(term1)
        ngrams2 = distinct_ngrams(term2)

        A = len(ngrams1)
        B = len(ngrams2)
        C = len(ngrams1 & ngrams2)

        return float(2 * C) / float(A + B)

    def ngram_stemmer(self, word_list, size, equality):
        """Return *word_list* reduced by the n-gram stemming method.

        Every pair of terms whose Dice coefficient (n-grams of *size*) is
        above *equality* is conflated: the longer term is dropped (the
        later one if both have the same length).  Duplicates are removed
        from the result.
        """
        stop_dict = {}

        for i in range(len(word_list)):
            term1 = word_list[i]
            for j in range(i):
                term2 = word_list[j]
                if self.comp_ngrams(term1, term2, size) - equality > 0:
                    # so similar that they can be conflated:
                    # remove the longer term, keep the shorter one
                    if len(term2) > len(term1):
                        stop_dict[term2] = 0
                    else:
                        stop_dict[term1] = 0

        # keep all the items which do not appear in the stop list
        return_dict = self.convert_list_into_dictionary(word_list, 0)
        return [item for item in return_dict if item not in stop_dict]

    # vector functions -------------------------------------------------

    def kNN(self, vector_1, vector_2):
        """Return the Euclidean distance between two sparse vectors.

        Both vectors are dictionaries item:value; an item missing in one
        of them counts as 0.
        """
        global_distance = float(0)

        for item in vector_1:
            difference = float(vector_1[item]) - float(vector_2.get(item, 0))
            global_distance += difference * difference

        for item in vector_2:
            if item in vector_1:
                continue    # already counted above
            other_value = float(vector_2[item])
            global_distance += other_value * other_value

        return math.sqrt(global_distance)

    def comp_descriptors(self, request, document):
        """Return the degree of equality between a request and a document.

        request, document -- lists of descriptors

        The value is the number of common descriptors divided by
        sqrt(len(request)) * sqrt(len(document)); 0.0 if one list is empty.
        """
        items_request = len(request)
        items_document = len(document)
        if items_request == 0 or items_document == 0:
            return 0.0

        request_dict = self.convert_list_into_dictionary(request, 0)
        document_dict = self.convert_list_into_dictionary(document, 0)

        similar_descriptors = 0
        for item in request_dict:
            if item in document_dict:
                similar_descriptors += 1

        return float(similar_descriptors) / float(
            math.sqrt(items_request) * math.sqrt(items_document))

    def rank(self, request, document_list, order):
        """Rank documents by the equality of their descriptors with *request*.

        Returns a list of dictionaries with the keys "descriptors",
        "equality" and "list_no" (position in *document_list*), sorted by
        descending equality, or ascending if *order* is 1.
        """
        ranking_list = []

        for list_no, document in enumerate(document_list):
            equality = self.comp_descriptors(request, document)
            ranking_entry = {
                "descriptors": document,
                "equality": equality,
                "list_no": list_no
            }

            if not ranking_list:
                ranking_list.append(ranking_entry)
                continue

            if ranking_list[0]["equality"] <= equality:
                ranking_list.insert(0, ranking_entry)
                continue

            if ranking_list[-1]["equality"] >= equality:
                ranking_list.append(ranking_entry)
                continue

            # binary search for the first entry that is not better than
            # the new one; entries before `left` are better, entries from
            # `right` on are not
            left = 0
            right = len(ranking_list) - 1
            while right > left:
                middle = (right + left) // 2
                if ranking_list[middle]["equality"] <= equality:
                    right = middle
                else:
                    left = middle + 1

            ranking_list.insert(left, ranking_entry)

        if order == 1:
            # not descending
            ranking_list.reverse()

        return ranking_list

    # language detection ----------------------------------------

    def is_language(self, text):
        """Old name of is_language_by_keywords()."""
        return self.is_language_by_keywords(text)

    def is_language_by_keywords(self, text):
        """Return the share of keywords that occur in *text*.

        The keywords are read from the stored file or taken from
        self.words, depending on the use_filename flag.  Keywords and text
        are compared stripped and in lower case.  Returns a float between
        0 and 1, or 0 if the file cannot be read or there are no keywords.
        Side effect: self.line, self.words and self.list are overwritten.
        """
        value = 0

        if self.get_use_filename():
            word_list = self.get_file_contents(self.get_filename())
            if word_list == -1:
                return value
        else:
            word_list = self.get_words()

        # work on a copy, the caller's list stays untouched
        keywords = [word.strip().lower() for word in word_list]
        if not keywords:
            return value

        # get the words of the text, merged case-insensitively
        self.tf(text)
        self.compact_list()
        text_tf = self.get_list()

        hits = 0
        for item in keywords:
            if item in text_tf:
                hits += 1

        return float(hits) / float(len(keywords))

    # string search ----------------------------------------------------

    def kmp_search(self, text, pattern):
        """Return the start positions of *pattern* in *text*.

        Uses a Knuth-Morris-Pratt failure table; overlapping matches are
        reported.  An empty text or pattern yields an empty list.
        """
        if not text or not pattern:
            return []

        text_length = len(text)
        pattern_length = len(pattern)

        # initialize failure table
        fail = {0: -1}
        i = 0
        j = -1
        while i < pattern_length:
            if j == -1 or pattern[i] == pattern[j]:
                i += 1
                j += 1
                fail[i] = j
            else:
                j = fail[j]

        # search
        positions = []
        i = 0
        j = 0
        while i < text_length:
            if j == -1 or text[i] == pattern[j]:
                i += 1
                j += 1
            else:
                j = fail[j]

            # test for a match before the end of text is tested, so that
            # a match ending at the last character is reported
            if j >= pattern_length:
                positions.append(i - pattern_length)
                i = i - pattern_length + 1
                j = 0

        return positions

    # synonym functions -----------------------------------------

    def synonym_of(self, term, dictionary_file):
        """Return the synonym lines for *term* found in *dictionary_file*.

        Works with the plain text version of OpenThesaurus (release later
        than 2003-10-23): one line per meaning, entries separated by ";",
        lines sorted alphabetically, comment lines at the beginning.

        A binary search looks for lines whose first entry is *term*; if
        there is none, every line containing *term* as an entry is
        returned.  Returns a list of stripped lines, [] if the file cannot
        be read or *term* is empty.
        """
        word_list = self.get_file_contents(dictionary_file)
        if (word_list == -1) or (term == ""):
            return []

        last = len(word_list) - 1

        def index_term(position):
            return word_list[position].strip().split(";")[0]

        # ignore comments at the beginning
        left = 0
        right = last
        while right > left:
            if self.is_comment(word_list[left]):
                left += 1
            else:
                break
        list_left = left

        # use a binary search to find the given item (index term)
        while right > left:
            middle = (right + left) // 2
            v = self.cmp_strings(index_term(middle), term)

            if v == 0:
                # an item can have several meanings: collect the
                # neighbouring lines with the same index term
                lower = middle - 1
                while lower >= list_left:
                    if self.cmp_strings(index_term(lower), term) == 0:
                        lower -= 1
                    else:
                        break

                upper = middle + 1
                while upper <= last:
                    if self.cmp_strings(index_term(upper), term) == 0:
                        upper += 1
                    else:
                        break

                return [line.strip() for line in word_list[lower + 1:upper]]
            elif v == -1:
                # term follows piece
                left = middle + 1
            else:
                # piece follows term
                right = middle

        # still not found - check each line, the last one included
        synonyms = []
        for position in range(list_left, last + 1):
            line = word_list[position].strip()
            if term in line.split(";"):
                synonyms.append(line)

        return synonyms

    def is_synonym_of(self, term1, term2, dictionary_file):
        """Return 1 if *term1* and *term2* share a line of *dictionary_file*.

        Works with the plain text version of OpenThesaurus (release later
        than 2003-10-23).  Returns 0 if the file cannot be read or one of
        the terms is empty.
        """
        word_list = self.get_file_contents(dictionary_file)
        if (word_list == -1) or (term1 == "") or (term2 == ""):
            return 0

        last = len(word_list) - 1

        # ignore comments at the beginning
        left = 0
        while last > left:
            if self.is_comment(word_list[left]):
                left += 1
            else:
                break

        for position in range(left, last + 1):
            pieces = word_list[position].strip().split(";")
            if (term1 in pieces) and (term2 in pieces):
                return 1

        return 0

    # category functions ----------------------------------------

    def category_make_tree(self, category_string):
        """Convert a category string ("a/b/c") into a category tree.

        The tree is a list of nodes (see category_make_node); node 0 is
        the ROOT node and every level of the string is the only child of
        the node before it.  "up" and "next" hold list positions.
        """
        root_node = self.category_make_node("ROOT")
        root_node["up"] = 0
        root_node["root"] = 1
        tree = [root_node]

        for item in self.category_split_string(category_string):
            node = self.category_make_node(item)
            position = len(tree)

            # link previous node and current node
            node["up"] = position - 1
            tree[position - 1]["next"] = [position]

            tree.append(node)

        return tree

    def category_make_node(self, node_name):
        """Return an empty, unlinked category node named *node_name*."""
        return {
            "name": node_name,
            "next": [],
            "up": 0,
            "root": 0
        }

    def category_split_string(self, category_string):
        """Split the category string at "/" into its levels."""
        return category_string.split("/")

    def category_is_root_node(self, node):
        """Return 1 if *node* is a root node, else 0."""
        if node["root"] == 1:
            return 1
        return 0

    def category_is_leaf_node(self, node):
        """Return 1 if *node* has no child nodes, else 0."""
        if node["next"] == []:
            return 1
        return 0

    def category_get_root_node(self, tree):
        """Return the first root node of *tree*, or -1 if there is none."""
        for node in tree:
            if self.category_is_root_node(node):
                return node
        return -1
