#!/usr/bin/env python
# http://www.python.org/doc/current/lib/dom-node-objects.html
"""Augment a bookcase XML file with book details looked up on Amazon.

Usage: pybookcase infile.xml outfile.xml

For every book in the ``collection`` element a power-search query is built
from the existing title, authors and ISBN.  When Amazon returns exactly one
match, the book's title, subtitle, authors and ISBN are replaced by the
values from that match; otherwise XML comments describing the failed query
are appended to the book.
"""

# Namespace used for the elements created by this script.
_BOOK_NS = "http://periapsis.org/bookcase/"

# Only title, subtitle, authors and isbn are augmented.  For reference, the
# original script listed the bookcase DTD element order as:
#   title, subtitle, authors, binding, pur_date, pur_price, publisher,
#   edition, cr_years, pub_year, isbn, lccn, pages, languages, genres,
#   keywords, series, series_num, condition, signed, read, gift, loaned,
#   rating, comments

# Pause after each request; Amazon only permits one query per second.
_QUERY_DELAY = .7


def _children_elements(node):
    """Return the child nodes of *node* that are elements."""
    return [n for n in node.childNodes if n.nodeType == n.ELEMENT_NODE]


def _normalize_whitespace(text):
    """Collapse every run of whitespace in *text* to a single space."""
    return ' '.join(text.split())


def _normalize_isbn(chars):
    """Strip the hyphens from an ISBN string."""
    return chars.replace('-', '')


def _get_text(node):
    """Return the whitespace-normalized text of *node*'s first child.

    Returns None when the node has no children or when its first child
    carries no character data (for example, when it is an element).
    """
    node.normalize()  # merge adjacent text nodes so firstChild holds it all
    data = getattr(node.firstChild, "data", None)
    if data is None:
        return None
    return _normalize_whitespace(data)


def _build_query(query, property, value):
    """Append a ``property: value`` term to *query*, joined with " and ".

    Empty or missing values leave the query unchanged.
    """
    if value not in [None, ""]:
        if query != "":
            query += " and "
        query += "%s: %s" % (property, value)
    return query


def _get_feature(property, book):
    """Return the first child element of *book* named *property*, or None.

    This walks the book's children on every call, which is clumsy but the
    books are small.
    """
    for feature in _children_elements(book):
        if feature.localName == property:
            return feature
    return None


def _split_title(text):
    """Split "Title: Subtitle" at the first ": ".

    Returns (title, subtitle); subtitle is "" when there is no separator.
    """
    title, _sep, subtitle = text.partition(": ")
    return title, subtitle


def _replace_feature(book, replacement, feature):
    """Swap *feature* for *replacement* in *book*, or append if absent."""
    if feature is not None:
        book.replaceChild(replacement, feature)
    else:
        book.appendChild(replacement)


def _set_text_feature(doc, book, name, text):
    """Give *book* a *name* element (in the bookcase namespace) holding *text*."""
    replacement = doc.createElementNS(_BOOK_NS, name)
    replacement.appendChild(doc.createTextNode(text))
    _replace_feature(book, replacement, _get_feature(name, book))


def _append_comments(doc, book, comments):
    """Append each string in *comments* to *book* as an XML comment."""
    for comment in comments:
        book.appendChild(doc.createComment(comment))


def _query_for_book(book):
    """Build the Amazon power-search query from a book's title, authors, isbn."""
    query = ""
    for feature in _children_elements(book):
        name = feature.localName
        if name == "title":
            query = _build_query(query, "title", _get_text(feature))
        elif name == "authors":
            for author in _children_elements(feature):
                query = _build_query(query, "author", _get_text(author))
        elif name == "isbn":
            isbn = _get_text(feature)
            if isbn is not None:  # an empty <isbn/> has no text to normalize
                isbn = _normalize_isbn(isbn)
            query = _build_query(query, "isbn", isbn)
    return query


def _augment_book(doc, book, results, detail):
    """Replace title, subtitle, authors and isbn of *book* with Amazon's data.

    New elements are created in *doc* rather than moving nodes out of the
    result document, so they belong to the right document and namespace.
    Fields that the result does not provide are left untouched.
    """
    product_names = results.getElementsByTagName("ProductName")
    full_title = None
    if product_names:
        full_title = _get_text(product_names[0])
    if full_title is not None:
        title, subtitle = _split_title(full_title)
        _set_text_feature(doc, book, "title", title)
        if subtitle != "":
            _set_text_feature(doc, book, "subtitle", subtitle)

    author_names = [
        name
        for name in (_get_text(a) for a in detail.getElementsByTagName("Author"))
        if name is not None
    ]
    if author_names:  # keep the existing authors when Amazon returns none
        authors = _get_feature("authors", book)
        if authors is None:
            authors = doc.createElementNS(_BOOK_NS, "authors")
            book.appendChild(authors)
        for old_author in _children_elements(authors):
            authors.removeChild(old_author)
        for name in author_names:
            author = doc.createElementNS(_BOOK_NS, "author")
            author.appendChild(doc.createTextNode(name))
            authors.appendChild(author)

    isbns = detail.getElementsByTagName("Isbn")
    if isbns:  # in rare cases there is no ISBN
        isbn = _get_text(isbns[0])
        if isbn is not None:
            _set_text_feature(doc, book, "isbn", isbn)


def bookAugment(doc, outfd):
    """Augment every book in *doc* from Amazon and pretty-print the result.

    doc   -- parsed DOM document containing a ``bookcase`` element and a
             ``collection`` element whose child elements are the books.
    outfd -- writable stream that receives the pretty-printed ``bookcase``.

    Raises IndexError when the document has no ``bookcase`` or
    ``collection`` element.  Amazon errors are not raised; they are recorded
    as XML comments on the affected book.
    """
    # Imported here so the module can be imported without these packages.
    import time
    import amazon
    from xml.dom.ext import PrettyPrint

    bookcase = doc.getElementsByTagName("bookcase")[-1]
    collection = doc.getElementsByTagName("collection")[-1]

    for book in _children_elements(collection):
        comments = []
        query = _query_for_book(book)  # the query to pass to searchByPower
        comments.append("query = %s" % query.encode('utf-8'))

        # Perform the query, e.g. 'author:Stephenson and title:Snow Crash'
        error = None
        try:
            results, url = amazon.searchByPower(query, return_xml=1)
        except amazon.AmazonError as e:
            error = e
        finally:
            # Throttle after every request, including failed ones.
            time.sleep(_QUERY_DELAY)
        if error is not None:
            comments.append("ERROR %s" % error)
            _append_comments(doc, book, comments)
            continue
        comments.append("query='%s'" % url)

        details = results.getElementsByTagName("Details")
        if len(details) != 1:  # not a unique entry
            comments.append("There is not a single return")
            _append_comments(doc, book, comments)
            continue

        _augment_book(doc, book, results, details[0])

    PrettyPrint(bookcase, outfd)


def print_usage():
    """Print the command-line usage message to standard output."""
    print("pybookcase infile.xml outfile.xml")
    print("pybookcase will augment a bookcase XML file with other information from ")
    print("the python interface")


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    argv -- argument list without the program name; defaults to
            sys.argv[1:].  The first positional argument is the input file
            (default: stdin), the second the output file (default: stdout).

    Returns 0 on success or after -h, 2 on a command-line error.
    """
    import getopt
    import sys

    if argv is None:
        argv = sys.argv[1:]
    try:
        options, files = getopt.getopt(argv, "h")
    except getopt.error:
        print_usage()
        return 2
    for option, _value in options:
        if option == '-h':
            print_usage()
            return 0

    from xml.dom import minidom

    # Parse the input completely before opening the output, so that using
    # the same path for both does not truncate the data before it is read.
    if files:
        infd = open(files[0])
        try:
            doc = minidom.parse(infd)
        finally:
            infd.close()
    else:
        doc = minidom.parse(sys.stdin)

    if len(files) > 1:
        outfd = open(files[1], 'w')
        try:
            bookAugment(doc, outfd)
        finally:
            outfd.close()
    else:
        bookAugment(doc, sys.stdout)
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())