#!/bin/env python
"""movieIndex.py -- look up movies from barcode scans
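
Reads one barcode per line from stdin (e.g. scanned with a CueCat),
looks each code up at upcdatabase.com, searches IMDB for the resulting
description, and writes the movies found to stdout as an HTML ordered
list sorted by year.  URL builders for Amazon and a few other lookup
sites are also defined.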
"""

# Revision stamp in RCS/CVS "$Id$" keyword form (file name, revision,
# date, author, state).  Presumably rewritten by the version-control
# system on check-in, so it should not be edited by hand.
__version__ = "$Id: movieIndex.py,v 1.5 2005/01/02 06:49:41 connolly Exp $"

import urllib2, urllib
import sys

#########

def main(argv):
    print "<ol>"
    movies = []
    for code in sys.stdin:
        diag("starting", code)
        code = code.split()[0]
        upcLink = inUPCdb(code)
        markup = urllib2.urlopen(upcLink).read()
        about = tableDict(markup)
        ttl = about['Description']
        if ttl.startswith("(*) "): ttl = ttl[4:] # skip (*)
        if ttl.endswith(" (Widescreen Edition)"):
            ttl = ttl[:-len(" (Widescreen Edition")]
        imdbLink = atIMDB(ttl)

        markup = urllib2.urlopen(imdbLink).read()
        #diag("IMDB markup:", markup)

        try:
            # exact match?
            data = getTitleYear(markup)
        except StopIteration:
            # no...search results page
            try:
                data = findTitleYear(markup)
            except StopIteration:
                # didn't find it there either
                diag("not found:", code, ttl)
                continue

        ttl = data[:data.find("(")].strip()
        yr = data[data.find("(")+1:data.find(")")]

        imdbLink = movieHome(markup)
        movies.append((yr, ttl, code, upcLink, imdbLink))
        diag("done with", code, yr, ttl)

    movies.sort()
    for yr, ttl, code, upcLink, imdbLink in movies:
        print "<li><cite><a href='%s'>%s</a></cite> (%s)<br /><tt><small><a href='%s'>%s</a></small></tt></li>" % (imdbLink, ttl, yr, upcLink, code)
    print "</ol>"



def tags(markup):
    """quick-n-dirty XML parser

    Generates, in order, the pieces of markup found between '>'
    characters.  The '>' delimiters themselves are dropped, so a typical
    piece is some text followed by '<' and the inside of a tag.  Whatever
    follows the last '>' (possibly the empty string) is the final piece.
    """
    start = 0
    while 1:
        stop = markup.find(">", start)
        if stop < 0:
            break
        yield markup[start:stop]
        start = stop + 1
    # remainder after the last '>' (or all of markup if there was none)
    yield markup[start:]


def attr(n, tag):
    """Return the value of double-quoted attribute n in tag, or None.

    The value runs from just after the first occurrence of n=" up to the
    next double quote; if there is no closing quote, the rest of tag is
    returned.

    >>> attr('content', '<meta name="description" content="Parenthood (1989): find the latest new')
    'Parenthood (1989): find the latest new'

    >>> attr('class', '<strong class="title"')
    'title'

    """

    opener = n + "=\""
    at = tag.find(opener)
    if at < 0:
        return None

    first = at + len(opener)
    last = tag.find('"', first)
    if last < 0:
        # unterminated value: take everything that is left
        return tag[first:]
    return tag[first:last]


def skipTo(ts, t, atr=None, val=None):
    """Advance ts to the next tag starting with t; return the text passed over.

    ts is an iterator of pieces as produced by tags(): each piece is
    character data optionally followed by '<' and the inside of a tag.
    Pieces are consumed from ts until one carries a tag that starts with
    t (e.g. "<li" or "</li").  If atr is given, the tag must also have
    attribute atr equal to val.  The matching piece is consumed too, so a
    later call on the same iterator continues after it.

    Returns the concatenated character data of every piece consumed,
    including the data in front of the matching tag.

    Raises StopIteration if ts is exhausted before a match is found.

    >>> skipTo(tags("<li>abc</li>"), "</li")
    'abc'

    >>> skipTo(tags('<li>abc</li>def<strong class="title">'), "<strong", "class", "title")
    'abcdef'
    """

    data = ''
    for piece in ts:
        delim = piece.find("<")
        if delim < 0:
            # No tag in this piece (text containing a literal '>', or the
            # tail after the last tag): all of it is character data.
            data += piece
            continue
        data += piece[:delim]
        tag = piece[delim:]
        if not tag.startswith(t):
            continue
        if atr and attr(atr, tag) != val:
            continue
        return data
    # ran out of markup without a match; callers catch this
    raise StopIteration



def tableDict(markup):
    """Turn the three-column rows of the first table in markup into a dict.

    Scans forward to the first piece whose tag starts with "<table", then
    collects the leading text of every piece ending in "</td" as a cell.
    At each "</tr", a row of exactly three cells adds an entry mapping
    the first cell to the third; rows of any other length are ignored.
    Returns the dict when "</table" is reached.

    Raises ValueError if markup has no "<table" tag, and StopIteration
    if the markup ends before the table is closed.
    """
    pieces = tags(markup)

    # position the iterator just past the opening <table ...> tag
    sawTable = False
    for piece in pieces:
        if piece.strip().startswith("<table"):
            sawTable = True
            break
    if not sawTable:
        raise ValueError

    table = {}
    cells = []
    for piece in pieces:
        if piece.endswith("</table"):
            return table
        if piece.endswith("</td"):
            # cell text is whatever precedes the first '<' of this piece
            cells.append(piece[:piece.find("<")])
        if piece.endswith("</tr"):
            if len(cells) == 3:
                table[cells[0]] = cells[2]
            cells = []
    # markup ran out before </table
    raise StopIteration
            
 
 
def getTitleYear(markup):
    """Return the "Title (Year)" text from an IMDB movie page.

    The text is everything between the <strong class="title"> tag and the
    next </strong, with any nested tags removed.

    >>> getTitleYear('<h1><strong class="title">Jerry Maguire <small>(<a href="/Sections/Years/1996">1996</a>)</small></strong></h1>')
    'Jerry Maguire (1996)'

    raises StopIteration if it's not there
    """

    pieces = tags(markup)
    # discard everything up to and including the title's opening tag
    skipTo(pieces, '<strong', atr='class', val='title')
    titleYear = skipTo(pieces, '</strong')
    return titleYear

        
def findTitleYear(markup):
    """Return the text of the first list item of the first <ol in markup.

    Used as the fallback when the IMDB response is a search-results page
    rather than a movie page: the first <li> after the first <ol is taken
    as the best match, and its text (nested tags removed) is returned.

    Raises StopIteration if the markup runs out before an <ol, an <li and
    the closing </li have all been seen.
    """
    ts = tags(markup)
    skipTo(ts, "<ol")
    skipTo(ts, "<li")
    return skipTo(ts, "</li")


def movieHome(markup):
    """Build the IMDB title URL from the first "/tt.../" found in markup.

    >>> movieHome('.... /tt0116695/ ...')
    'http://www.imdb.com/title/tt0116695/'

    """

    # first character after the slash that introduces the title id
    start = markup.find("/tt") + 1
    # the id ends at the next slash
    stop = markup.find("/", start)
    ident = markup[start:stop]
    return 'http://www.imdb.com/title/%s/' % ident


def amzTitleYear(markup):
    """Return (title, year) taken from a <meta ... content="Title (Year)..."> tag.

    Looks at the <meta tags of markup in order and uses the first one
    that has a content="..." attribute.  The title is the content text
    before the first "(" (not stripped), the year is the text between the
    first "(" and the first ")".

    Returns None if markup has no <meta tag with a content attribute.
    """
    for tag in markup.split(">"):
        #print "found tag:", tag
        if not tag.strip().startswith("<meta"):
            continue
        v = attr('content', tag)
        if v is None:
            # a <meta> without content="..." has nothing to parse; keep looking
            continue
        ttl = v[:v.find("(")]
        yr = v[v.find("(")+1:v.find(")")]
        return ttl, yr
    return None


def atIMDB(ttl):
    """Return the IMDB title-search URL for ttl (URL-quoted), at most 20 hits."""
    extra = ''
    if ttl == "Rudy":
        # oddly, "Rudy" comes up higher with nm=on
        extra = 'nm=on;'

    query = urllib.quote(ttl)
    return 'http://www.imdb.com/find?q=%s;tt=on;%smx=20' % (query, extra)


def atMovieFinder(code):
    """Return the moviefinder4u.com movie-detail page URL for code."""
    template = 'http://www.moviefinder4u.com/movie_detail/%s.html'
    return template % code


def atTampoo(code):
    """Return the tampoo.com Video-category item URL for UPC code."""
    template = 'http://www.tampoo.com/item.php?upc=%s&category_id=Video'
    return template % code

def atYahoo(code):
    """Return the Yahoo! Movies shop (DVD info) URL for UPC code."""
    #@@ &id=1800104808
    template = 'http://movies.yahoo.com/shop?d=hv&cf=dvdinfo&upc=%s'
    return template % code

def inUPCdb(code):
    """Return the upcdatabase.com item-page URL for UPC code."""
    template = 'http://www.upcdatabase.com/item.pl?upc=%s'
    return template % code

def atAmazon(code):
    """Return an amazon.com keyword-search URL (index=vhs) for code."""
    template = 'http://www.amazon.com/exec/obidos/search-handle-url/index=vhs&field-keywords=%s&search-type=ss&bq=1&store-name=video/ref=xs_ap_l_xgl27/102-9298041-1208141'
    return template % code

#########

def diag(*args):
    """Write a diagnostic line to stderr.

    The arguments are shown as a single tuple, exactly as Python would
    print one, followed by a newline.
    """
    sys.stderr.write("%s\n" % (args,))


##########
 
def _test():
    """Run the doctests via doctest.testmod() with its default arguments.

    With no module given, testmod() examines __main__, i.e. this file
    when it is run as a script with --test.
    """
    from doctest import testmod
    testmod()

if __name__ == "__main__":
    # "--test" as the first argument runs the doctests and exits;
    # anything else processes barcodes from stdin.
    wantTest = len(sys.argv) > 1 and sys.argv[1] == '--test'
    if not wantTest:
        main(sys.argv)
    else:
        _test()
        sys.exit()

