import straw, error, re, string, sgmllib, email.Utils
from mx import DateTime
from mx.DateTime.Parser import DateTimeFromString

class ImageParser(sgmllib.SGMLParser):
    """SGML parser that records the ``src`` of every ``<img>`` tag it sees.

    Each ``src`` value is passed through ``straw.utils.complete_url``
    together with ``feed.location`` (presumably the base URL used to resolve
    relative image paths -- confirm against ``straw.utils``) and the result
    is collected in document order.
    """

    def __init__(self, feed):
        """Create a parser bound to *feed*.

        feed -- object whose ``location`` attribute is read for every
                ``<img>`` tag that carries a ``src`` attribute.
        """
        sgmllib.SGMLParser.__init__(self)
        self._image_urls = []
        self._feed = feed

    def do_img(self, attrs):
        """Handle an ``<img>`` tag; *attrs* is a list of (name, value) pairs."""
        for attr_name, attr_value in attrs:
            if attr_name != 'src':
                continue
            completed = straw.utils.complete_url(attr_value,
                                                 self._feed.location)
            self._image_urls.append(completed)

    def get_image_urls(self):
        """Return the list of image URLs collected so far (not a copy)."""
        return self._image_urls

def unicode_field(dict, key, enc, default=''):
    """Return ``dict[key]`` decoded to unicode, or *default* if it is absent.

    dict    -- mapping of field names to raw values (only ``.get`` is used).
    key     -- name of the field to fetch.
    enc     -- codec name used for the first decoding attempt.
    default -- value used when *key* is missing.

    Byte strings are decoded with *enc*.  If that fails -- because the bytes
    are invalid for *enc* or because *enc* is not a known codec name -- the
    value is decoded as iso-8859-1 instead, which accepts any byte sequence.
    Empty strings and values that are not byte strings (for example text
    that is already unicode) are returned unchanged.

    Note: only the iso-8859-1 fallback strips surrounding whitespace; that
    long-standing asymmetry is kept so existing output does not change.
    """
    v = dict.get(key, default)
    # ``bytes`` is an alias of ``str`` on Python 2.6+.  Anything else needs no
    # decoding, and trying to decode unicode text would raise TypeError.
    if v == '' or not isinstance(v, bytes):
        return v
    try:
        return v.decode(enc)
    except (ValueError, LookupError):
        # ValueError covers UnicodeError (bad bytes); LookupError is raised
        # for an unknown codec name, e.g. junk taken from an XML header.
        # try with iso-8859-1, usually we get at least something
        return v.decode("iso-8859-1").strip()

iso_date_re = re.compile('^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d')
def read_date(s):
    """Parse the date string *s* and return the parser's date/time result.

    Surrounding whitespace is removed first.  Text that starts with a
    ``YYYY-MM-DDTHH:MM:SS`` timestamp is handed to ``DateTimeFromString``
    with the ``'iso'`` format, everything else with the ``'lit'`` format.
    If the parser raises ``ValueError`` the current time
    (``DateTime.now()``) is returned instead.
    """
    text = s.strip()
    if iso_date_re.match(text):
        formats = ('iso',)
    else:
        formats = ('lit',)
    try:
        return DateTimeFromString(text, formats)
    except ValueError:
        # Unparseable date: fall back to "now" rather than failing the feed.
        return DateTime.now()

ws_re = re.compile("\s+")
def dewhitespacify(s):
    """Return *s* trimmed, with each internal whitespace run as one space."""
    trimmed = s.strip()
    collapsed = ws_re.sub(" ", trimmed)
    return collapsed

xmlheaderRe = re.compile(r'<\?.*encoding=(["\'])(?P<encoding>.*?)\1.*\?>')

def _clean_field(fields, key, enc):
    """Return fields[key] decoded with *enc* and whitespace-normalised."""
    return dewhitespacify(unicode_field(fields, key, enc))

def _split_source(st):
    """Split a "url,name" source string into a {'url', 'text'} dict.

    Returns None when *st* contains no comma, i.e. it does not follow the
    expected format.
    """
    pieces = st.split(',', 1)
    if len(pieces) != 2:
        return None
    return {'url': pieces[0].strip(), 'text': pieces[1]}

def parse(content, feed):
    """Parse raw feed *content* and return a populated straw.ParsedSummary.

    content -- the feed document; it is handed to ``straw.FeedParser`` and,
               when the parser reports no encoding, matched against the XML
               declaration to find one.
    feed    -- object stored on every item as ``item.feed``.  It is also
               given to ImageParser (which reads ``feed.location``); if
               image extraction fails, ``feed.title`` is read for the log
               message and ``feed.error`` is set.

    The encoding used to decode all text fields is, in order of preference:
    the parser channel's 'encoding' entry, the encoding named in the XML
    declaration at the start of *content* (lower-cased and written back to
    ``parser.channel['encoding']``), or ``straw.utils.get_encoding()``.

    Failures while extracting image URLs from an item description are logged
    and recorded on the feed, but do not stop the parse.  A 'source' value
    that is not in "url,name" form is ignored instead of aborting the parse.
    """
    parser = straw.FeedParser()
    parsed = straw.ParsedSummary()
    parser.feed(content)

    channel = parser.channel
    enc = channel.get('encoding', '')

    if enc == '':
        match = xmlheaderRe.match(content)
        if match:
            enc = channel['encoding'] = match.group('encoding').lower()
        else:
            enc = straw.utils.get_encoding()

    # channel properties
    parsed.title = _clean_field(channel, "title", enc)
    parsed.description = _clean_field(channel, "description", enc)
    parsed.link = _clean_field(channel, "link", enc)
    parsed.copyright = _clean_field(channel, "rights", enc)
    parsed.last_build_date = read_date(channel.get("date", ""))

    if 'creator' in channel:
        parsed.creator = _clean_field(channel, "creator", enc)

    # item properties
    for idict in parser.items:
        item = straw.SummaryItem()
        item.feed = feed
        item.title = _clean_field(idict, 'title', enc)

        if 'link' in idict:
            item.link = _clean_field(idict, 'link', enc)

        # Prefer the full content over the plain description.
        description = ''
        if 'content_encoded' in idict:
            description = unicode_field(idict, 'content_encoded', enc)
        elif 'description' in idict:
            description = unicode_field(idict, 'description', enc)

        if description:
            imp = ImageParser(feed)
            item.description = description
            try:
                imp.feed(description)
                for im in imp.get_image_urls():
                    item.add_image(im)
            except Exception as ex:
                # Best effort: a broken description must not lose the item.
                error.log(_("Exception occurred %s in feed %s") % (ex, item.feed.title))
                item.feed.error = _("Error parsing item %s: %s") % (item.title, ex)

        # dc:creator
        if 'creator' in idict:
            item.creator = _clean_field(idict, 'creator', enc)

        if 'guid' in idict:
            item.guid = unicode_field(idict, 'guid', enc)

        if 'date' in idict:
            item.pub_date = read_date(idict.get('date', ""))

        if 'source' in idict:
            # source format: "url,name"   e.g. 'http://foo.com,Foo'
            source = _split_source(_clean_field(idict, 'source', enc))
            if source is not None:
                item.source = source

        if 'license' in idict:
            # freshmeat
            if 'fm' in parser.namespacemap:
                item.fm_license = unicode_field(idict, 'license', enc)
                item.fm_changes = unicode_field(idict, 'changes', enc)
            else:
                license_url = unicode_field(idict, "license", enc)
                item.license_urls.append(license_url)

        # prism
        if 'publicationName' in idict:
            item.publication_name = unicode_field(idict, 'publicationName', enc)
            item.publication_volume = unicode_field(idict, 'volume', enc)
            item.publication_number = unicode_field(idict, 'number', enc)
            item.publication_section = unicode_field(idict, 'section', enc)
            item.publication_starting_page = unicode_field(idict, 'startingPage', enc)

        parsed.addItem(item)

    parser.reset()

    return parsed

