#!/usr/bin/python
# -*- coding: UTF-8 -*-

"""
    This file is part of asd.
    
    asd is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    asd Copyright 2005 Antonini Daniele <arpeda@gmail.com>

"""


import string
import struct
import sys
import re
import os.path

sys.path.append("__PATH_TO_SUBSTITUTE__/ASD/Query/")
import stemmer

# Lexicon map, filled below from lessico.asd: stemmed term -> rest of the
# line (a byte-offset string into document.asd, used with int()+seek()).
dictionary = {}
# Stop-list of words dropped from the query before stemming: English
# function words, single letters, numbers spelled out, two-letter
# country/TLD codes and common web terms.  Kept as a dict with value 1
# because membership is tested with has_key() below (Python 2 idiom).
# NOTE(review): "z" appears twice (harmless: duplicate dict keys collapse).
stopList = {  "a" : 1, "ii" : 1, "about" : 1, "above" : 1, "according" : 1, "across" : 1, "39" : 1, "actually" : 1, "ad" : 1, 
"adj" : 1, "ae" : 1, "af" : 1, "after" : 1, "afterwards" : 1, "ag" : 1, "again" : 1, "against" : 1, "ai" : 1, "al" : 1, "all" : 1,
 "almost" : 1, "alone" : 1, "along" : 1, "already" : 1, "also" : 1, "although" : 1, "always" : 1, "am" : 1, "among" : 1, "amongst" : 1,
 "an" : 1, "and" : 1, "another" : 1, "any" : 1, "anyhow" : 1, "anyone" : 1, "anything" : 1, "anywhere" : 1, "ao" : 1, "aq" : 1, "ar" : 1,
 "are" : 1, "aren" : 1, "aren't" : 1, "around" : 1, "arpa" : 1, "as" : 1, "at" : 1, "au" : 1, "aw" : 1, "az" : 1, "b" : 1, "ba" : 1,
"bb" : 1, "bd" : 1, "be" : 1, "became" : 1, "because" : 1, "become" : 1, "becomes" : 1, "becoming" : 1, "been" : 1, "before" : 1, 
"beforehand" : 1, "begin" : 1, "beginning" : 1, "behind" : 1, "being" : 1, "below" : 1, "beside" : 1, "besides" : 1, "between" : 1, 
"beyond" : 1, "bf" : 1, "bg" : 1, "bh" : 1, "bi" : 1, "billion" : 1, "bj" : 1, "bm" : 1, "bn" : 1, "bo" : 1, "both" : 1, "br" : 1,
"bs" : 1, "bt" : 1, "but" : 1, "buy" : 1, "bv" : 1, "bw" : 1, "by" : 1, "bz" : 1, "c" : 1, "ca" : 1, "can" : 1, "can't" : 1, "cannot" : 1,
"caption" : 1, "cc" : 1, "cd" : 1, "cf" : 1, "cg" : 1, "ch" : 1, "ci" : 1, "ck" : 1, "cl" : 1, "click" : 1, "cm" : 1, "cn" : 1, "co" : 1,
"co." : 1, "com" : 1, "copy" : 1, "could" : 1, "couldn" : 1, "couldn't" : 1, "cr" : 1, "cs" : 1, "cu" : 1, "cv" : 1, "cx" : 1, "cy" : 1,
"cz" : 1, "d" : 1, "de" : 1, "did" : 1, "didn" : 1, "didn't" : 1, "dj" : 1, "dk" : 1, "dm" : 1, "do" : 1, "does" : 1, "doesn" : 1,
"doesn't" : 1, "don" : 1, "don't" : 1, "down" : 1, "during" : 1, "dz" : 1, "e" : 1, "each" : 1, "ec" : 1, "edu" : 1, "ee" : 1, "eg" : 1, 
"eh" : 1, "eight" : 1, "eighty" : 1, "either" : 1, "else" : 1, "elsewhere" : 1, "end" : 1, "ending" : 1, "enough" : 1, "er" : 1, "es" : 1, 
"et" : 1, "etc" : 1, "even" : 1, "ever" : 1, "every" : 1, "everyone" : 1, "everything" : 1, "everywhere" : 1, "except" : 1, "f" : 1,
"few" : 1, "fi" : 1, "fifty" : 1, "find" : 1, "first" : 1, "five" : 1, "fj" : 1, "fk" : 1, "fm" : 1, "fo" : 1, "for" : 1, "former" : 1,
"formerly" : 1, "forty" : 1, "found" : 1, "four" : 1, "fr" : 1, "free" : 1, "from" : 1, "further" : 1, "fx" : 1, "g" : 1, "ga" : 1, "gb" : 1,
"gd" : 1, "ge" : 1, "get" : 1, "gf" : 1, "gg" : 1, "gh" : 1, "gi" : 1, "gl" : 1, "gm" : 1, "gmt" : 1, "gn" : 1, "go" : 1, "gov" : 1, 
"gp" : 1, "gq" : 1, "gr" : 1, "gs" : 1, "gt" : 1, "gu" : 1, "gw" : 1, "gy" : 1, "h" : 1, "had" : 1, "has" : 1, "hasn" : 1, "hasn't" : 1,
"have" : 1, "haven" : 1, "haven't" : 1, "he" : 1, "he'd" : 1, "he'll" : 1, "he's" : 1, "help" : 1, "hence" : 1, "her" : 1, "here" : 1, 
"here's" : 1, "hereafter" : 1, "hereby" : 1, "herein" : 1, "hereupon" : 1, "hers" : 1, "herself" : 1, "him" : 1, "himself" : 1, "his" : 1, 
"hk" : 1, "hm" : 1, "hn" : 1, "home" : 1, "homepage" : 1, "how" : 1, "however" : 1, "hr" : 1, "ht" : 1, "htm" : 1, "html" : 1, "http" : 1,
"hu" : 1, "hundred" : 1, "i" : 1, "i'd" : 1, "i'll" : 1, "i'm" : 1, "i've" : 1, "i.e." : 1, "id" : 1, "ie" : 1, "if" : 1, "il" : 1, "im" : 1, 
"in" : 1, "inc" : 1, "inc." : 1, "indeed" : 1, "information" : 1, "instead" : 1, "int" : 1, "into" : 1, "io" : 1, "iq" : 1, "ir" : 1, "is" : 1,
"isn" : 1, "isn't" : 1, "it" : 1, "it's" : 1, "its" : 1, "itself" : 1, "j" : 1, "je" : 1, "jm" : 1, "jo" : 1, "join" : 1, "jp" : 1, "k" : 1,
"ke" : 1, "kg" : 1, "kh" : 1, "ki" : 1, "km" : 1, "kn" : 1, "kp" : 1, "kr" : 1, "kw" : 1, "ky" : 1, "kz" : 1, "l" : 1, "la" : 1, "last" : 1,
"later" : 1, "latter" : 1, "lb" : 1, "lc" : 1, "least" : 1, "less" : 1, "let" : 1, "let's" : 1, "li" : 1, "like" : 1, "likely" : 1, "lk" : 1,
"ll" : 1, "lr" : 1, "ls" : 1, "lt" : 1, "ltd" : 1, "lu" : 1, "lv" : 1, "ly" : 1, "m" : 1, "ma" : 1, "made" : 1, "make" : 1, "makes" : 1,
"many" : 1, "maybe" : 1, "mc" : 1, "md" : 1, "me" : 1, "meantime" : 1, "meanwhile" : 1, "mg" : 1, "mh" : 1, "microsoft" : 1, "might" : 1,
"mil" : 1, "million" : 1, "miss" : 1, "mk" : 1, "ml" : 1, "mm" : 1, "mn" : 1, "mo" : 1, "more" : 1, "moreover" : 1, "most" : 1, "mostly" : 1,
"mp" : 1, "mq" : 1, "mr" : 1, "mrs" : 1, "ms" : 1, "msie" : 1, "mt" : 1, "mu" : 1, "much" : 1, "must" : 1, "mv" : 1, "mw" : 1, "mx" : 1, 
"my" : 1, "myself" : 1, "mz" : 1, "n" : 1, "na" : 1, "namely" : 1, "nc" : 1, "ne" : 1, "neither" : 1, "net" : 1, "netscape" : 1, 
"never" : 1, "nevertheless" : 1, "new" : 1, "next" : 1, "nf" : 1, "ng" : 1, "ni" : 1, "nine" : 1, "ninety" : 1, "nl" : 1, "no" : 1, 
"nobody" : 1, "none" : 1, "nonetheless" : 1, "noone" : 1, "nor" : 1, "not" : 1, "nothing" : 1, "now" : 1, "nowhere" : 1, "np" : 1, "nr" : 1, 
"nu" : 1, "nz" : 1, "o" : 1, "of" : 1, "off" : 1, "often" : 1, "om" : 1, "on" : 1, "once" : 1, "one" : 1, "one's" : 1, "only" : 1, "onto" : 1, 
"or" : 1, "org" : 1, "other" : 1, "others" : 1, "otherwise" : 1, "our" : 1, "ours" : 1, "ourselves" : 1, "out" : 1, "over" : 1, "overall" : 1, 
"own" : 1, "p" : 1, "pa" : 1, "page" : 1, "pe" : 1, "per" : 1, "perhaps" : 1, "pf" : 1, "pg" : 1, "ph" : 1, "pk" : 1, "pl" : 1, "pm" : 1, 
"pn" : 1, "pr" : 1, "pt" : 1, "pw" : 1, "py" : 1, "q" : 1, "qa" : 1, "r" : 1, "rather" : 1, "re" : 1, "recent" : 1, "recently" : 1, 
"reserved" : 1, "ring" : 1, "ro" : 1, "ru" : 1, "rw" : 1, "s" : 1, "sa" : 1, "same" : 1, "sb" : 1, "sc" : 1, "sd" : 1, "se" : 1, "seem" : 1, 
"seemed" : 1, "seeming" : 1, "seems" : 1, "seven" : 1, "seventy" : 1, "several" : 1, "sg" : 1, "sh" : 1, "she" : 1, "she'd" : 1, "she'll" : 1, 
"she's" : 1, "should" : 1, "shouldn" : 1, "shouldn't" : 1, "si" : 1, "since" : 1, "site" : 1, "six" : 1, "sixty" : 1, "sj" : 1, "sk" : 1, 
"sl" : 1, "sm" : 1, "sn" : 1, "so" : 1, "some" : 1, "somehow" : 1, "someone" : 1, "something" : 1, "sometime" : 1, "sometimes" : 1, 
"somewhere" : 1, "sr" : 1, "st" : 1, "still" : 1, "stop" : 1, "su" : 1, "such" : 1, "sv" : 1, "sy" : 1, "sz" : 1, "t" : 1, "taking" : 1, 
"tc" : 1, "td" : 1, "ten" : 1, "text" : 1, "tf" : 1, "tg" : 1, "test" : 1, "th" : 1, "than" : 1, "that" : 1, "that'll" : 1, "that's" : 1, 
"the" : 1, "their" : 1, "them" : 1, "themselves" : 1, "then" : 1, "thence" : 1, "there" : 1, "there'll" : 1, "there's" : 1, "thereafter" : 1, 
"thereby" : 1, "therefore" : 1, "therein" : 1, "thereupon" : 1, "these" : 1, "they" : 1, "they'd" : 1, "they'll" : 1, "they're" : 1, 
"they've" : 1, "thirty" : 1, "this" : 1, "those" : 1, "though" : 1, "thousand" : 1, "three" : 1, "through" : 1, "throughout" : 1, "thru" : 1,
"thus" : 1, "tj" : 1, "tk" : 1, "tm" : 1, "tn" : 1, "to" : 1, "together" : 1, "too" : 1, "toward" : 1, "towards" : 1, "tp" : 1, "tr" : 1, 
"trillion" : 1, "tt" : 1, "tv" : 1, "tw" : 1, "twenty" : 1, "two" : 1, "tz" : 1, "u" : 1, "ua" : 1, "ug" : 1, "uk" : 1, "um" : 1, 
"under" : 1, "unless" : 1, "unlike" : 1, "unlikely" : 1, "until" : 1, "up" : 1, "upon" : 1, "us" : 1, "use" : 1, "used" : 1, "using" : 1, 
"uy" : 1, "uz" : 1, "v" : 1, "va" : 1, "vc" : 1, "ve" : 1, "very" : 1, "vg" : 1, "vi" : 1, "via" : 1, "vn" : 1, "vu" : 1, "w" : 1, 
"was" : 1, "wasn" : 1, "wasn't" : 1, "we" : 1, "we'd" : 1, "we'll" : 1, "we're" : 1, "we've" : 1, "web" : 1, "webpage" : 1, "website" : 1, 
"welcome" : 1, "well" : 1, "were" : 1, "weren" : 1, "weren't" : 1, "wf" : 1, "what" : 1, "what'll" : 1, "what's" : 1, "whatever" : 1, 
"when" : 1, "whence" : 1, "whenever" : 1, "where" : 1, "whereafter" : 1, "whereas" : 1, "whereby" : 1, "wherein" : 1, "whereupon" : 1, 
"wherever" : 1, "whether" : 1, "which" : 1, "while" : 1, "whither" : 1, "who" : 1, "who'd" : 1, "who'll" : 1, "who's" : 1, "whoever" : 1, 
"NULL" : 1, "whole" : 1, "whom" : 1, "whomever" : 1, "whose" : 1, "why" : 1, "will" : 1, "with" : 1, "within" : 1, "without" : 1, "won" : 1, 
"won't" : 1, "would" : 1, "wouldn" : 1, "wouldn't" : 1, "ws" : 1, "www" : 1, "x" : 1, "y" : 1, "ye" : 1, "yes" : 1, "yet" : 1, "you" : 1, 
"you'd" : 1, "you'll" : 1, "you're" : 1, "you've" : 1, "your" : 1, "yours" : 1, "yourself" : 1, "yourselves" : 1, "yt" : 1, "yu" : 1, "z" : 1, 
"za" : 1, "zm" : 1, "zr" : 1, "10" : 1, "z" : 1 };


def usage():
    """Print a short usage message on stderr.

    The original was an empty stub, so a user who ran the script with no
    arguments got no feedback before the error exit.  The return value is
    still None, as callers expect.
    """
    sys.stderr.write( "usage: %s <word> [<word> ...]\n" % os.path.basename( sys.argv[0] ) )
    return

def remove_dups(lst):
    """Return a new list with duplicate elements removed.

    Improvements over the original:
      - no longer shadows the builtin name ``list``;
      - the result preserves the order of first occurrence instead of
        returning the keys in arbitrary hash order.
    Elements must be hashable (lists, dicts etc. cannot be members).
    """
    seen = {}          # dict used as a set; works on Python 2 and 3
    unique = []
    for item in lst:
        if item not in seen:
            seen[item] = None
            unique.append(item)
    return unique

def concatenate( lst1, lst2 ):
    """Union of the two lists: join them, then strip duplicates."""
    combined = lst1 + lst2
    return remove_dups( combined )

def intersect( lst1, lst2 ):
    """Return the elements of lst2 that also appear in lst1.

    Keeps lst2's order, exactly like the original filter() version, but
    builds a set of lst1 once so each membership test is O(1) instead of
    O(len(lst1)) — O(n+m) overall instead of O(n*m).  Callers here pass
    lists of int document IDs, which are hashable.
    """
    members = set( lst1 )
    return [ x for x in lst2 if x in members ]

def difference( lst1, lst2 ):
    """Return lst1 - lst2: the elements of lst1 not present in lst2.

    Same order and multiplicity as the original filter() version; the
    set of lst2 is built once so each exclusion test is O(1).
    """
    excluded = set( lst2 )
    return [ x for x in lst1 if x not in excluded ]

def removeEmptyOperation( lst1, op ):
    """Return lst1 with every occurrence of op removed.

    The original built a throwaway one-element list and filtered with
    "not in"; since "in" tests by equality, a direct != comparison is
    equivalent and simpler.
    """
    return [ x for x in lst1 if x != op ]

## Abort early when any of the three asd database files is missing: the
## query code below opens all of them unconditionally.
if not ( os.path.isfile("/var/cache/man/asd/lessico.asd") and  os.path.isfile("/var/cache/man/asd/document.asd") and os.path.isfile("/var/cache/man/asd/man_page_title.asd") ):
    sys.exit( "inverted-file not exists: execute create_asdb to create database" )

## Check parameter
##
## No search words on the command line: show usage and exit with error.
if len(sys.argv) == 1:
    usage(  )
    sys.exit( -1 )

## Read lessico and create hash
##
fileDict = open( '/var/cache/man/asd/lessico.asd','r' )
tmpList = fileDict.readlines()
fileDict.close()

for value in tmpList:
    hashValue = string.split( value, ' ', 1 )
    dictionary[hashValue[0]] = hashValue[1]

## Get words to search
##
wordToSearchTmp = []
wordToSearchStemmed = []

for i in range( 1, len(sys.argv) ):
    parameter = string.split( sys.argv[i] )

    if len( parameter ) > 1:
        wordToSearchTmp += parameter
    else:
        wordToSearchTmp.append( sys.argv[i] )

## Filter with stop list
##
wordToSearchFiltered = [];

for word in wordToSearchTmp:
    word = word.lower();
    if stopList.has_key( word ):
        print " \033[31m*\033[0m \033[1m" + word + "\033[0m is filtered by stop-lists"
    else:
        wordToSearchFiltered.append( word );

print
    
## Stemming word
##
ps = stemmer.PorterStemmer()

for word in wordToSearchFiltered:
    word = ps.stem( word, 0, len(word)-1 )
    wordToSearchStemmed.append( word )

remove_dups( wordToSearchStemmed )

print "Your words are stemmed in ",
print wordToSearchStemmed

if not wordToSearchStemmed:
    print "No man pages found !!"
    sys.exit( 0 )

## Searching Fase
##   - checking if words are present in lessico
##
noWord = []
docIDList = []

for word in wordToSearchStemmed:
    if not dictionary.has_key( word ):
        noWord.append(word)

if len(noWord):
    print "No man pages contains:",
    for i in range( 0, len(noWord) ):
        print " " + noWord[i],
    print
    sys.exit( 0 )

## Sodisfy query
##
invertedFile = open( '/var/cache/man/asd/document.asd','r' )

numElementType = invertedFile.read( 1 )
docIDType = invertedFile.read( 1 )
frequenceType = invertedFile.read( 1 )

beginList = dictionary[wordToSearchStemmed[0]]
invertedFile.seek( int(beginList) )
elementi = invertedFile.read( struct.calcsize( numElementType ) )
elementi = struct.unpack( numElementType, elementi )[0]

for i in range( 0, elementi ):
    """ Read docID and frequence """
    coppia = invertedFile.read( struct.calcsize( docIDType ) + struct.calcsize( frequenceType ) )
    docIDList.append( struct.unpack( docIDType+frequenceType, coppia)[0] )

docIDList = map( int, docIDList )

### intersect docIDList of first word with docIDList of remaining words
##
## Each remaining word's posting list is read exactly like the first
## one, then AND-ed into docIDList (conjunctive query).  Two clean-ups
## versus the original, both behavior-preserving:
##   - the record sizes are loop invariants, so struct.calcsize() is
##     computed once instead of once per record;
##   - the inner loop index is named j (the original reused i, shadowing
##     the outer word index — harmless in Python but error-prone).
countSize = struct.calcsize( numElementType )
pairSize = struct.calcsize( docIDType ) + struct.calcsize( frequenceType )

for i in range( 1, len(wordToSearchStemmed) ):
    tmpDocIDList = []

    beginList = dictionary[wordToSearchStemmed[i]]
    invertedFile.seek( int(beginList) )
    elementi = invertedFile.read( countSize )
    elementi = struct.unpack( numElementType, elementi )[0]

    for j in range( 0, elementi ):
        ## Read a (docID, frequency) pair; keep only the docID.
        coppia = invertedFile.read( pairSize )
        tmpDocIDList.append( struct.unpack( docIDType+frequenceType, coppia )[0] )

    tmpDocIDList = map( int, tmpDocIDList )
    docIDList = intersect( docIDList, tmpDocIDList )

invertedFile.close()

## Load man_page_title
##
hashManPage = {}

fileManPageTitle = open( '/var/cache/man/asd/man_page_title.asd','r' )

tmpList = fileManPageTitle.readlines()

for value in tmpList:
    hashValue = string.split( value, ' ', 1 )
    hashManPage[int(hashValue[0])] = hashValue[1]

fileManPageTitle.close(  )

## Print man pages that soddisfy query
##
## Each title string still carries the newline it got from readlines(),
## so despite print's trailing comma every title ends its own line.
print
for i in range( 0, len(docIDList) ):
    print hashManPage[docIDList[i]],
