xoxo-sample-code-python

From Microformats Wiki
Jump to navigation Jump to search

XOXO Sample Code - Python

This is a sub-page of xoxo-sample-code.

xoxo.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""xoxo.py - a utility module for transforming to and from the XHTMLOutlines format XOXO http://microformats.org/wiki/xoxo
toXOXO takes a Python datastructure (tuples, lists or dictionaries, arbitrarily nested) and returns a XOXO representation of it.
fromXOXO parses an XHTML file for a xoxo list and returns the structure
"""
__version__ = "0.9"
__date__ = "2005-11-02"
__author__ = "Kevin Marks <kmarks@technorati.com>"
__copyright__ = "Copyright 2004-2006, Kevin Marks & Technorati"
__license__ = "http://creativecommons.org/licenses/by/2.0/ CC-by-2.0], [http://www.apache.org/licenses/LICENSE-2.0 Apache 2.0"
__credits__ = """Tantek Çelik and Mark Pilgrim for data structure"""
__history__ = """
TODO: add <title> tag
TODO: add a proper profile link
0.9 smarter parsing for encoding and partial markup; fix dangling dictionary case
0.8 work in unicode then render to utf-8
0.7 initial encoding support - just utf-8 for now
0.6 support the special behavior for url properties  to/from <a>
0.5 fix some awkward side effects of whitespace and text outside our expected tags; simplify writing code
0.4 add correct XHTML headers so it validates
0.3 read/write version; fixed invalid nested list generation;
0.1 first write-only version
"""

try:
    True, False
except NameError:
    True, False = not not 1, not 1
containerTags={'ol':False,'ul':False,'dl':False}
import codecs
import htmlentitydefs
import re
import sgmllib
import urllib
import urlparse

def toUnicode(key):
    """Return *key* as a unicode string.

    Unicode input (including subclasses) is returned unchanged.  Byte
    strings are decoded as UTF-8; when that fails they are decoded as
    windows-1252, with undecodable bytes replaced by U+FFFD so the
    fallback itself can never raise.  Any other object is converted with
    the unicode constructor.
    """
    textType = type(u'')
    if isinstance(key, textType):
        return key
    if not isinstance(key, (bytes, bytearray)):
        # Numbers, None, etc.: there is nothing to decode, just stringify.
        return textType(key)
    try:
        return key.decode('utf-8')
    except UnicodeDecodeError:
        # Invalid UTF-8 is most likely windows-1252.  A few byte values
        # (e.g. 0x81) are undefined in that code page, hence 'replace'.
        return key.decode('windows_1252', 'replace')

def makeXOXO(struct,className=None):
    """Render *struct* as a unicode XOXO fragment (no document wrapper).

    Lists and tuples become <ol> elements with one <li> per item;
    *className*, when given, is written as the class of that <ol> only
    (nested structures are rendered without a class).  Dictionaries become
    a <dl>; a dictionary with a 'url' key is rendered as an <a> element
    first, consuming its 'text', 'title', 'rel' and 'type' keys, and any
    remaining keys follow in a <dl>.  Everything else is converted to
    unicode text.

    Values are inserted verbatim: no HTML escaping is applied.
    """
    unicodeType = type(u'unicode')

    if isinstance(struct, (list, tuple)):
        if className:
            opener = u'<ol class="%s">' % className
        else:
            opener = u"<ol>"
        items = [u"<li>" + makeXOXO(entry, None) + "</li>" for entry in struct]
        return opener + u''.join(items) + u"</ol>"

    if isinstance(struct, dict):
        remaining = struct.copy()   # keys are removed as they are rendered
        pieces = []
        if 'url' in remaining:
            href = toUnicode(remaining['url'])
            pieces.append(u'<a href="%s" ' % href)
            # Link text: explicit text, else the title, else the URL itself.
            label = remaining.get('text', remaining.get('title', href))
            for attr in ('title', 'rel', 'type'):
                if attr not in remaining:
                    continue
                rendered = makeXOXO(remaining[attr], None)
                pieces.append(u'%s="%s" ' % (attr, rendered))
                del remaining[attr]
            pieces.append(u'>%s</a>' % makeXOXO(label, None))
            remaining.pop('text', None)
            del remaining['url']
        if remaining:
            pieces.append(u"<dl>")
            for key, value in remaining.items():
                rendered = makeXOXO(value, None)
                pieces.append(u'<dt>%s</dt><dd>%s</dd>' % (toUnicode(key), rendered))
            pieces.append(u"</dl>")
        return u''.join(pieces)

    if type(struct) == unicodeType:
        return struct

    # Byte strings are decoded as they are; other objects are stringified first.
    if type(struct) != type(' '):
        struct = str(struct)
    return toUnicode(struct)
class AttrParser(sgmllib.SGMLParser):
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.text=[]
        self.encoding='utf-8'
    def cleanText(self,inText):
        if type(inText) == type(u'unicode'):
            inText = inText.encode(self.encoding,'replace')
        self.text=[]
        self.reset()
        self.feed(inText)
        return ''.join(self.text)
    def setEncoding(self,encoding):
        if 'ascii' in encoding:
            encoding='windows_1252' # so we don't throw an exception on high-bit set chars in there by mistake
        if encoding and encoding !='text/html':
            try:
                canDecode = codecs.getdecoder(encoding)
                self.encoding = encoding
            except:
                try:
                    encoding='japanese.' +encoding
                    canDecode = codecs.getdecoder(encoding)
                    self.encoding = encoding
                except:
                    print "can't deal with encoding %s" % encoding
                    
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "©", ref will be "copy"
        # map through to unicode where we can
        try:
            entity =htmlentitydefs.name2codepoint[ref]
            self.handleUnicodeData(unichr(entity))
        except:
            try:
                handle_charref(ref) # deal with char-ref's missing the '#' (see Akma)
            except:
                self.handle_data("&%s" % ref)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for " ", ref will be "160"
        # Reconstruct the original character reference.
        try:
            if ref[0]=='x':
                self.handleUnicodeData(unichr(int(ref[1:],16)))
            else:
                self.handleUnicodeData(unichr(int(ref)))
        except:
            self.handle_data("&#%s" % ref)

# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
    def handle_data(self, text):
        if type(text)==type(u' '):
            self.handleUnicodeData(text)
        if self.encoding== 'utf-8':
            try:
                uText = unicode(text,self.encoding) #utf-8 is pretty clear when it is wrong
            except:
                uText = unicode(text,'windows_1252','ignore') # and this is the likely wrongness
        else:
            uText = unicode(text,self.encoding,'replace') # if they have really broken encoding, (eg lots of shift-JIS blogs)
        self.handleUnicodeData(uText)
    def handleUnicodeData(self, uText):
        self.text.append(uText)
        
class xoxoParser(AttrParser):
    """Parser that rebuilds Python lists and dictionaries from XOXO markup.

    <ol>/<ul> become lists, <dl> becomes a dictionary and <a> becomes a
    dictionary holding its attributes (href is stored under 'url').  The
    result accumulates in ``self.structs``.

    State:
      structs   -- structures created and not yet attached to a parent
      xostack   -- containers that are currently open, innermost last
      textstack -- text collected for each open <li>/<dt>/<dd>/<a>
      attrparse -- helper parser used to decode attribute values
    """

    def __init__(self):
        AttrParser.__init__(self)
        self.structs = []
        self.xostack = []
        self.textstack = ['']
        self.attrparse = AttrParser()

    def normalize_attrs(self, attrs):
        """Return *attrs* with lower-cased names and decoded values.

        The values of 'rel' and 'type' are lower-cased as well.
        """
        normalized = []
        for name, value in attrs:
            name = name.lower()
            value = self.attrparse.cleanText(value)
            if name in ('rel', 'type'):
                value = value.lower()
            normalized.append((name, value))
        return normalized

    def setEncoding(self, encoding):
        """Apply *encoding* to this parser and to the attribute parser."""
        AttrParser.setEncoding(self, encoding)
        self.attrparse.setEncoding(encoding)

    def pushStruct(self, struct):
        """Open *struct* as the current container.

        An empty dictionary (a <dl>) that follows a dictionary made from
        an <a> does not start a new structure: the <a> dictionary is
        re-opened so the definitions are added to it.
        """
        reopenLink = False
        if isinstance(struct, dict) and not struct and self.structs:
            previous = self.structs[-1]
            if isinstance(previous, dict) and 'url' in previous:
                # Identity, not equality: an equal-looking but distinct
                # dictionary is still a different structure.  With no open
                # container at all the link is re-opened as well.
                reopenLink = (not self.xostack
                              or previous is not self.xostack[-1])
        if reopenLink:
            self.xostack.append(self.structs[-1]) # put back the <a>-made one for extra def's
        else:
            self.structs.append(struct)
            self.xostack.append(struct)

    def do_meta(self, attributes):
        """Pick the charset up from <meta http-equiv="Content-Type">."""
        atts = dict(self.normalize_attrs(attributes))
        if atts.get('http-equiv', '').lower() != "content-type":
            return
        if 'content' in atts:
            encoding = atts['content'].split('charset=')[-1]
            self.setEncoding(encoding)

    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        attrsD['url'] = attrsD.pop('href', '')
        self.pushStruct(attrsD)
        self.textstack.append('')

    def end_a(self):
        """Close the link; keep its text unless it repeats title or url."""
        val = self.textstack.pop()
        link = self.xostack.pop()
        if val and val != link.get('title', '') and val != link['url']:
            link['text'] = val

    def start_dl(self, attrs):
        self.pushStruct({})

    def end_dl(self):
        self.xostack.pop()

    def start_ol(self, attrs):
        self.pushStruct([])

    def end_ol(self):
        self.xostack.pop()

    def start_ul(self, attrs):
        self.pushStruct([])

    def end_ul(self):
        self.xostack.pop()

    def start_li(self, attrs):
        self.textstack.append('')

    def end_li(self):
        """Append the item's structures, or else its text, to the open list."""
        val = self.textstack.pop()
        if not self.xostack:
            return  # <li> outside any list: nothing to attach it to
        container = self.xostack[-1]
        # Everything above the container on structs was created inside
        # this item.  Compare by identity so that an empty nested list is
        # not mistaken for its (still empty) parent.
        while self.structs[-1] is not container:
            val = self.structs.pop()
            container.append(val)
        # Text is only kept when the item produced no structure.
        if isinstance(val, basestring):
            container.append(val)

    def start_dt(self, attrs):
        self.textstack.append('')

    def end_dt(self):
        # The key text stays on textstack until the matching </dd>.
        pass

    def start_dd(self, attrs):
        self.textstack.append('')

    def end_dd(self):
        """Store the definition under the key collected by the last <dt>."""
        val = self.textstack.pop()
        key = self.textstack.pop()
        if not self.xostack:
            return  # <dd> outside any <dl>
        container = self.xostack[-1]
        # A structure created inside the <dd> replaces its text value.
        if self.structs[-1] is not container:
            val = self.structs.pop()
        container[key] = val

    def handleUnicodeData(self, text):
        """Add *text* to the innermost text buffer.

        Text is skipped when no tag is open or when the innermost open
        tag (``self.stack``, kept by the SGMLParser base class) is one of
        the pure containers listed in containerTags.
        """
        if len(self.stack) and containerTags.get(self.stack[-1], True): #skip text not within an element
            self.textstack[-1] += text
def toXOXO(struct,addHTMLWrapper=False,cssUrl=''):
    """Return *struct* serialized as UTF-8 encoded XOXO markup.

    Anything that is not a list or tuple is wrapped in a one-item list, so
    the result is always an <ol class="xoxo"> element.  With
    *addHTMLWrapper* the list is embedded in an XHTML document, and a
    non-empty *cssUrl* adds a stylesheet @import to its <head>.
    """
    if type(struct) in (tuple, list):
        inStruct = struct
    else:
        inStruct = [struct]

    if not addHTMLWrapper:
        return makeXOXO(inStruct,'xoxo').encode('utf-8')

    # NOTE(review): the DOCTYPE below has no quotes separating the public
    # and system identifiers; it is kept as-is because testxoxo.py compares
    # this output byte-for-byte.
    parts = [u'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN
http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head profile=""><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />''']
    if cssUrl:
        parts.append(u'<style type="text/css" >@import "%s";</style>' % cssUrl)
    parts.append(u"</head><body>%s</body></html>" % makeXOXO(inStruct,'xoxo'))
    return u''.join(parts).encode('utf-8')
    
def fromXOXO(html):
    """Parse XOXO markup in *html* and return the structure it describes.

    Empty top-level structures are dropped, and a list holding a single
    element is unwrapped repeatedly, so a one-item outline yields the item
    itself rather than a list around it.
    """
    parser = xoxoParser()
    parser.feed(html)

    result = []
    for struct in parser.structs:
        if struct:
            result.append(struct)

    # Peel off single-element list wrappers.
    while type(result) == type([1,]) and len(result) == 1:
        result = result[0]
    return result

# Allow direct invocation
# Read HTML from URL, parse into data structures, then re-output

import sys

# Command-line use: fetch the URL given as the only argument, parse the
# XOXO it contains and print it back out as a complete XHTML document.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        raise SystemExit("Usage: "+sys.argv[0]+" url\n"+__doc__)
    url = sys.argv[1]
    source = urllib.urlopen(url)
    try:
        html = source.read(-1)
    finally:
        # Always release the connection, even when reading fails.
        source.close()
    s = fromXOXO(html)
    p = toXOXO(s,True)
    print(p)

testxoxo.py

# -*- coding: utf-8 -*-
"""testxoxo.py 
Unit tests for xoxo.py
This file tests the functions in xoxo.py 
The underlying model here is http://diveintopython.org/unit_testing/index.html 

run from command line with
python testxoxo.py -v
"""
import xoxo
reload(xoxo)
import unittest

class xoxoTestCases(unittest.TestCase):
    '''Tests for xoxo.toXOXO and xoxo.fromXOXO.

    Covers serialization of lists, dictionaries and strings, the optional
    XHTML wrapper, round trips through both functions, tolerance of stray
    text and whitespace, the <a> shorthand for url/title/rel/type, and
    character encoding.
    '''
    
    def testSimpleList(self):
        '''make a xoxo file from a list'''
        l = ['1','2','3']
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>1</li><li>2</li><li>3</li></ol>')
    def testNestedList(self):
        '''make a xoxo file from a list with a list in'''
        l = ['1',['2','3']]
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>1</li><li><ol><li>2</li><li>3</li></ol></li></ol>')

    def testDictionary(self):
        '''make a xoxo file from a dictionary'''
        # NOTE(review): the expected string fixes the order of the two
        # keys, so this depends on the dictionary's iteration order.
        d = {'test':'1','name':'Kevin'}
        html = xoxo.toXOXO(d)
        self.assertEqual(html,'<ol class="xoxo"><li><dl><dt>test</dt><dd>1</dd><dt>name</dt><dd>Kevin</dd></dl></li></ol>')

    def testSingleItem(self):
        '''make a xoxo file from a string'''
        l = "test"
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>test</li></ol>')

    def testWrapDiffers(self):
        '''make a xoxo file from a string with and without html wrapper and check they are different'''
        l = "test"
        html = xoxo.toXOXO(l)
        htmlwrap =  xoxo.toXOXO(l,addHTMLWrapper=True)
        self.failIfEqual(html,htmlwrap)

    def testWrapSingleItem(self):
        '''make a wrapped xoxo file from a string'''
        l = "test"
        html = xoxo.toXOXO(l,addHTMLWrapper=True)
        self.assertEqual(html,'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN
http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head profile=""><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body><ol class="xoxo"><li>test</li></ol></body></html>''')

    def testWrapItemWithCSS(self):
        '''make a wrapped xoxo file from a string, with a stylesheet import'''
        l = "test"
        html = xoxo.toXOXO(l,addHTMLWrapper=True,cssUrl='reaptest.css')
        self.assertEqual(html,'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN
http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head profile=""><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><style type="text/css" >@import "reaptest.css";</style></head><body><ol class="xoxo"><li>test</li></ol></body></html>''')

    def testDictionaryRoundTrip(self):
        ''' make a dictionary into a xoxo file and back again; check it is the same'''
        d = {'test':'1','name':'Kevin'}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)
        
    def testDictionaryWithURLRoundTrip(self):
        ''' make a dictionary with a url in into a xoxo file and back again; check it is the same'''
        d = {'url':'http://example.com','name':'Kevin'}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)    
    def testNestedDictionaryRoundTrip(self):
        ''' make a dictionary with a dict in into a xoxo file and back again; check it is the same'''
        d = {'test':'1','inner':{'name':'Kevin'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)
    def testNestedDictionaryWithURLRoundTrip(self):
        ''' make a dictionary with a url and a dict into a xoxo file and back again; check it is the same'''
        d = {'url':'http://example.com','inner':{'name':'Kevin'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)
    def testNestedDictionariesWithURLsRoundTrip(self):
        ''' make a dictionary with a url and a dict with a url into a xoxo file and back again; check it is the same'''
        d = {'url':'http://example.com','inner':{'name':'Kevin','url':'http://slashdot.org'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)
    def testListRoundTrip(self):
        ''' make a list into a xoxo file and back again; check it is the same'''
        l = ['3','2','1']
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)
    def testListofDictsRoundTrip(self):
        ''' make a list of Dicts into a xoxo file and back again; check it is the same'''
        l = ['3',{'a':'2'},{'b':'1','c':'4'}]
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)
    def testListofListsRoundTrip(self):
        ''' make a list of Lists into a xoxo file and back again; check it is the same'''
        l = ['3',['a','2'],['b',['1',['c','4']]]]
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)
    def testDictofListsRoundTrip(self):
        ''' make a dict with lists in into a xoxo file and back again; check it is the same'''
        d = {'test':['1','2'],
        'name':'Kevin',
        'nestlist':['a',['b','c']],
        'nestdict':{'e':'6','f':'7'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)

    def testXOXOjunkInContainers(self):
        '''make sure text outside <li> etc is ignored'''
        d=xoxo.fromXOXO('<ol>bad<li><dl>worse<dt>good</dt><dd>buy</dd> now</dl></li></ol>')
        self.assertEqual(d,{'good': 'buy'})

    def testXOXOjunkInElements(self):
        '''make sure text within <li> but outside a subcontainer is ignored'''
        l=xoxo.fromXOXO('<ol><li>bad<dl><dt>good</dt><dd>buy</dd></dl>worse</li><li>bag<ol><li>OK</li></ol>fish</li></ol>')
        self.assertEqual(l,[{'good': 'buy'},['OK']])

    def testXOXOWithSpacesAndNewlines(self):
        '''parse some xoxo with spaces and newlines in and check result is right'''
        xoxoSample= '''<ol class='xoxo'> 
  <li>
    <dl>
        <dt>text</dt>
        <dd>item 1</dd>
        <dt>description</dt>
        <dd> This item represents the main point we're trying to make.</dd>
        <dt>url</dt>
        <dd>http://example.com/more.xoxo</dd>
        <dt>title</dt>
        <dd>title of item 1</dd>
        <dt>type</dt>
        <dd>text/xml</dd>
        <dt>rel</dt>
        <dd>help</dd>
    </dl>
  </li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        d2={'text':'item 1',
            'description':" This item represents the main point we're trying to make.",
            'url':'http://example.com/more.xoxo',
            'title':'title of item 1',
            'type':'text/xml',
            'rel':'help'
            }
        xoxoAgain = xoxo.toXOXO(d)
        self.assertEqual(d,d2)
        #this needs a smarter whitespace-sensitive comparison
        #self.assertEqual(xoxoSample,xoxoAgain)

    def testSpecialAttributeDecoding(self):
        '''parse the <dl> form and the <a href= rel= ...> form of one item and check they give the same result'''
        xoxoSample= '''<ol class='xoxo'> 
  <li>
    <dl>
        <dt>text</dt>
        <dd>item 1</dd>
        <dt>url</dt>
        <dd>http://example.com/more.xoxo</dd>
        <dt>title</dt>
        <dd>title of item 1</dd>
        <dt>type</dt>
        <dd>text/xml</dd>
        <dt>rel</dt>
        <dd>help</dd>
    </dl>
  </li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        smartxoxoSample= '''<ol class='xoxo'> 
  <li><a href="http://example.com/more.xoxo"
         title="title of item 1"
         type="text/xml"
         rel="help">item 1</a> 
<!-- note how the "text" property is simply the contents of the <a> element -->
  </li>
</ol>'''
        d2 = xoxo.fromXOXO(smartxoxoSample)
        self.assertEqual(d,d2)
    def testSpecialAttributeAndDLDecoding(self):
        '''parse an item given as <a href= rel= ...> plus a <dl> and check it equals the all-<dl> form'''
        xoxoSample= '''<ol class="xoxo"> 
  <li>
    <dl>
        <dt>text</dt>
        <dd>item 1</dd>
        <dt>description</dt>
        <dd> This item represents the main point we're trying to make.</dd>
        <dt>url</dt>
        <dd>http://example.com/more.xoxo</dd>
        <dt>title</dt>
        <dd>title of item 1</dd>
        <dt>type</dt>
        <dd>text/xml</dd>
        <dt>rel</dt>
        <dd>help</dd>
    </dl>
  </li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        smartxoxoSample= '''<ol class="xoxo"> 
  <li><a href="http://example.com/more.xoxo"
         title="title of item 1"
         type="text/xml"
         rel="help">item 1</a> 
<!-- note how the "text" property is simply the contents of the <a> element -->
      <dl>
        <dt>description</dt>
          <dd> This item represents the main point we're trying to make.</dd>
      </dl>
  </li>
</ol>'''
        d2 = xoxo.fromXOXO(smartxoxoSample)
        self.assertEqual(d,d2)
    def testSpecialAttributeEncode(self):
        '''check it makes an <a href with a url parameter'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help','text':'an example'}
        html=xoxo.toXOXO(d)
        expectedHTML= '<ol class="xoxo"><li><a href="http://example.com/more.xoxo" title="sample url" rel="help" type="text/xml" >an example</a></li></ol>' 
        self.assertEqual(html,expectedHTML)
        
    def testSpecialAttributeRoundTripFull(self):
        '''check a dictionary with url, title, type, rel and text survives a round trip'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help','text':'an example'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))
    def testSpecialAttributeRoundTripNoText(self):
        '''check a url dictionary with no text attribute survives a round trip'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))
    def testSpecialAttributeRoundTripNoTextOrTitle(self):
        '''check a url dictionary with no text or title attribute survives a round trip'''
        d={'url':'http://example.com/more.xoxo'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))
    def testAttentionRoundTrip(self):
        '''check nested <a> and <dl> and <a> are preserved'''
        kmattn='''<ol class="xoxo"><li><a href="http://www.boingboing.net/" title="Boing Boing Blog" >Boing Boing Blog</a><dl><dt>alturls</dt><dd><ol><li><a href="http://boingboing.net/rss.xml" >xmlurl</a></li></ol></dd><dt>description</dt><dd>Boing Boing Blog</dd></dl></li><li><a href="http://www.financialcryptography.com/" title="Financial Cryptography" >Financial Cryptography</a><dl><dt>alturls</dt><dd><ol><li><a href="http://www.financialcryptography.com/mt/index.rdf" >xmlurl</a></li></ol></dd><dt>description</dt><dd>Financial Cryptography</dd></dl></li><li><a href="http://hublog.hubmed.org/" title="HubLog" >HubLog</a><dl><dt>alturls</dt><dd><ol><li><a href="http://hublog.hubmed.org/index.xml" >xmlurl</a></li><li><a href="http://hublog.hubmed.org/foaf.rdf" >foafurl</a></li></ol></dd><dt>description</dt><dd>HubLog</dd></dl></li></ol>''';
        d = xoxo.fromXOXO(kmattn)
        newattn = xoxo.toXOXO(d)
        d2 = xoxo.fromXOXO(newattn)
        self.assertEqual(newattn,xoxo.toXOXO(d2))
        self.assertEqual(d,d2)
        self.assertEqual(kmattn,newattn)
        
    def testUnicodeRoundtrip(self):
        '''check unicode characters can go to xoxo and back'''
        src=unicode('Tantek \xc3\x87elik and a snowman \xe2\x98\x83','utf-8')
        html = xoxo.toXOXO(src)
        self.assertEqual(src,xoxo.fromXOXO(html))
    def testUtf8Roundtrip(self):
        '''check utf8 characters can go to xoxo and back'''
        src='Tantek \xc3\x87elik and a snowman \xe2\x98\x83'
        html = xoxo.toXOXO(src)
        self.assertEqual(src,xoxo.fromXOXO(html).encode('utf-8'))
    def testWindows1252Roundtrip(self):
        '''check 1252 characters can go to xoxo and back'''
        src='This is an evil\xa0space'
        html = xoxo.toXOXO(src)
        self.assertEqual(src,xoxo.fromXOXO(html).encode('windows-1252'))
# Run as a script: let unittest handle the command line.  When the module
# is imported (or reloaded) instead, run the whole suite straight away.
if __name__ != "__main__":
    suite = unittest.makeSuite(xoxoTestCases,'test')
    runner = unittest.TextTestRunner()
    runner.run(suite)
else:
    unittest.main()