xoxo-sample-code-python

From Microformats Wiki
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

XOXO Sample Code - Python

This is a sub-page of xoxo-sample-code.

xoxo.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""xoxo.py - a utility module for transforming to and from the XHTMLOutlines format XOXO http://microformats.org/wiki/xoxo
toXOXO takes a Python datastructure (tuples, lists or dictionaries, arbitrarily nested) and returns a XOXO representation of it.
fromXOXO parses an XHTML file for a xoxo list and returns the structure
"""
# Module metadata.
__version__ = "0.9"
__date__ = "2005-11-02"
__author__ = "Kevin Marks <kmarks@technorati.com>"
__copyright__ = "Copyright 2004-2006, Kevin Marks & Technorati"
# NOTE(review): the value below looks like the remains of two wiki-style
# links ("[url label], [url label]") whose outer brackets were lost; confirm
# the intended licence text before relying on this string.
__license__ = "http://creativecommons.org/licenses/by/2.0/ CC-by-2.0], [http://www.apache.org/licenses/LICENSE-2.0 Apache 2.0"
__credits__ = """Tantek Çelik and Mark Pilgrim for data structure"""
# Change log, newest entry first; open TODO items are listed at the top.
__history__ = """
TODO: add <title> tag
TODO: add a proper profile link
0.9 smarter parsing for encoding and partial markup; fix dangling dictionary case
0.8 work in unicode then render to utf-8
0.7 initial encoding support - just utf-8 for now
0.6 support the special behavior for url properties  to/from <a>
0.5 fix some awkward side effects of whitespace and text outside our expected tags; simplify writing code
0.4 add correct XHTML headers so it validates
0.3 read/write version; fixed invalid nested list generation;
0.1 first write-only version
"""

# Compatibility shim: on interpreters that predate the True/False builtins
# the bare reference raises NameError, and the names are then defined here.
try:
    True, False
except NameError:
    True, False = not not 1, not 1
# Elements that only hold other elements.  xoxoParser.handleUnicodeData looks
# the innermost open tag up here (defaulting to True for every other tag) and
# discards character data found directly inside a tag mapped to False.
containerTags={'ol':False,'ul':False,'dl':False}
import codecs
import htmlentitydefs
import re
import sgmllib
import urllib
import urlparse

def toUnicode(key):
    """Return key as a unicode string.

    Unicode input (including subclasses) is returned unchanged.  A byte
    string is decoded as UTF-8; when it is not valid UTF-8 it is decoded as
    windows-1252 instead, which accepts the high-bit characters commonly
    found in mislabelled text.

    Any error other than a failed UTF-8 decode (for example TypeError for a
    non-string key) propagates to the caller.
    """
    if isinstance(key, type(u'')):
        return key
    try:
        return unicode(key,'utf-8')
    except UnicodeDecodeError:
        # Only a failed decode selects the fallback codec; a bare except here
        # would also swallow KeyboardInterrupt and SystemExit.
        return unicode(key,'windows_1252')

def makeXOXO(struct,className=None):
    """Render struct as a unicode fragment of XOXO markup.

    Lists and tuples become <ol> elements (carrying class=className when one
    is given) with one <li> per item.  Dictionaries with a 'url' key become
    an <a> element: 'title', 'rel' and 'type' are written as attributes and
    the link text is 'text', else 'title', else the url itself.  Any other
    dictionary entries are written as a <dl> of <dt>/<dd> pairs.  Unicode
    strings are emitted as they are; everything else is converted with str()
    when needed and then passed through toUnicode().

    Values are inserted verbatim: no HTML escaping is applied.
    """
    if isinstance(struct,(list,tuple)):
        if className:
            opening = u'<ol class="%s">' % className
        else:
            opening = u"<ol>"
        items = [u"<li>" + makeXOXO(entry,None) + u"</li>" for entry in struct]
        return opening + u''.join(items) + u"</ol>"

    if isinstance(struct,dict):
        pieces = []
        remaining = struct.copy()   # work on a copy; keys are removed as they are used
        if 'url' in remaining:
            href = toUnicode(remaining['url'])
            pieces.append(u'<a href="%s" ' % href)
            # pick the link text before 'title' is removed by the loop below
            linkText = remaining.get('text',remaining.get('title',href))
            for attr in ('title','rel','type'):
                if attr not in remaining:
                    continue
                rendered = makeXOXO(remaining[attr],None)
                pieces.append(u'%s="%s" ' % (attr,rendered))
                del remaining[attr]
            pieces.append(u'>%s</a>' % makeXOXO(linkText,None))
            if 'text' in remaining:
                del remaining['text']
            del remaining['url']
        if len(remaining):
            pieces.append(u"<dl>")
            for name,value in remaining.items():
                rendered = makeXOXO(value,None)
                label = toUnicode(name)
                pieces.append(u'<dt>%s</dt><dd>%s</dd>' % (label, rendered))
            pieces.append(u"</dl>")
        return u''.join(pieces)

    if type(struct) == type(u'unicode'):
        return u'' + struct

    # plain values: stringify anything that is not already a byte string
    if type(struct)!=type(' '):
        struct=str(struct)
    return u'' + toUnicode(struct)
class AttrParser(sgmllib.SGMLParser):
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.text=[]
        self.encoding='utf-8'
    def cleanText(self,inText):
        if type(inText) == type(u'unicode'):
            inText = inText.encode(self.encoding,'replace')
        self.text=[]
        self.reset()
        self.feed(inText)
        return ''.join(self.text)
    def setEncoding(self,encoding):
        if 'ascii' in encoding:
            encoding='windows_1252' # so we don't throw an exception on high-bit set chars in there by mistake
        if encoding and encoding !='text/html':
            try:
                canDecode = codecs.getdecoder(encoding)
                self.encoding = encoding
            except:
                try:
                    encoding='japanese.' +encoding
                    canDecode = codecs.getdecoder(encoding)
                    self.encoding = encoding
                except:
                    print "can't deal with encoding %s" % encoding
                    
    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for "©", ref will be "copy"
        # map through to unicode where we can
        try:
            entity =htmlentitydefs.name2codepoint[ref]
            self.handleUnicodeData(unichr(entity))
        except:
            try:
                handle_charref(ref) # deal with char-ref's missing the '#' (see Akma)
            except:
                self.handle_data("&%s" % ref)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for " ", ref will be "160"
        # Reconstruct the original character reference.
        try:
            if ref[0]=='x':
                self.handleUnicodeData(unichr(int(ref[1:],16)))
            else:
                self.handleUnicodeData(unichr(int(ref)))
        except:
            self.handle_data("&#%s" % ref)

# called for each block of plain text, i.e. outside of any tag and
# not containing any character or entity references
    def handle_data(self, text):
        if type(text)==type(u' '):
            self.handleUnicodeData(text)
        if self.encoding== 'utf-8':
            try:
                uText = unicode(text,self.encoding) #utf-8 is pretty clear when it is wrong
            except:
                uText = unicode(text,'windows_1252','ignore') # and this is the likely wrongness
        else:
            uText = unicode(text,self.encoding,'replace') # if they have really broken encoding, (eg lots of shift-JIS blogs)
        self.handleUnicodeData(uText)
    def handleUnicodeData(self, uText):
        self.text.append(uText)
        
class xoxoParser(AttrParser):
    """Parse XOXO markup into nested Python lists and dictionaries.

    <ol> and <ul> elements become lists, <dl> elements become dictionaries
    keyed by their <dt> text, and <a> elements become dictionaries holding
    'url' (taken from href), the remaining attributes and, when it differs
    from both title and url, 'text'.  Finished top-level structures are left
    in self.structs.
    """
    def __init__(self):
        AttrParser.__init__(self)
        self.structs=[]      # containers created so far and not yet attached to a parent
        self.xostack=[]      # containers whose element is currently open, innermost last
        self.textstack=['']  # text gathered for each open <li>/<dt>/<dd>/<a>
        self.attrparse = AttrParser()  # separate parser used to clean attribute values

    def normalize_attrs(self, attrs):
        """Lower-case attribute names, clean their values, and lower-case rel/type values."""
        attrs = [(k.lower(), self.attrparse.cleanText(v)) for k, v in attrs]
        attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]
        return attrs

    def setEncoding(self,encoding):
        """Apply encoding to this parser and to the attribute-value parser."""
        AttrParser.setEncoding(self,encoding)
        self.attrparse.setEncoding(encoding)

    def pushStruct(self,struct):
        """Open a container, or reopen the dictionary of a preceding <a>.

        An empty dictionary (the start of a <dl>) that follows a closed <a>
        whose dictionary has not been attached yet reuses that dictionary, so
        the definitions are added to the link's own properties.
        """
        last = None
        if self.structs:
            last = self.structs[-1]
        # Compare by identity: an equal-but-distinct container is a different
        # element.  xostack may be empty when the <a> sat at top level.
        reopenLink = (isinstance(struct,dict) and len(struct)==0
                      and isinstance(last,dict) and 'url' in last
                      and (not self.xostack or last is not self.xostack[-1]))
        if reopenLink:
            self.xostack.append(last) # put back the <a>-made one for extra def's
        else:
            self.structs.append(struct)
            self.xostack.append(struct)

    def do_meta(self, attributes):
        """Pick up the charset from a <meta http-equiv="Content-Type"> tag."""
        atts = dict(self.normalize_attrs(attributes))
        if 'http-equiv' in atts:
            if atts['http-equiv'].lower() == "content-type":
                if 'content' in atts:
                    # everything after the last 'charset='; the whole value
                    # when no charset is declared
                    encoding = atts['content'].split('charset=')[-1]
                    self.setEncoding(encoding)

    def start_a(self,attrs):
        """Open a dictionary for a link, storing href under 'url'."""
        attrsD = dict(self.normalize_attrs(attrs))
        attrsD['url']= attrsD.get('href','')
        if 'href' in attrsD:
            del attrsD['href']
        self.pushStruct(attrsD)
        self.textstack.append('')

    def end_a(self):
        """Close a link, keeping its text only when it adds information."""
        val = self.textstack.pop()
        if val:
            # text equal to the title or the url is what makeXOXO writes by
            # default, so it is not stored as a separate 'text' entry
            if self.xostack[-1].get('title','') == val:
                val=''
            if self.xostack[-1]['url'] == val:
                val=''
            if val:
                self.xostack[-1]['text']=val
        self.xostack.pop()

    def start_dl(self,attrs):
        self.pushStruct({})

    def end_dl(self):
        self.xostack.pop()

    def start_ol(self,attrs):
        self.pushStruct([])

    def end_ol(self):
        self.xostack.pop()

    def start_ul(self,attrs):
        self.pushStruct([])

    def end_ul(self):
        self.xostack.pop()

    def start_li(self,attrs):
        self.textstack.append('')

    def end_li(self):
        """Attach the item's content to the enclosing list.

        Every container created inside the item is moved from structs into
        the open list; the item's text is appended only when the item held
        no container.
        """
        val = self.textstack.pop()
        # Identity, not equality: a nested ['x'] inside a parent that is also
        # ['x'] must still be recognised as a separate, unattached child.
        while self.structs[-1] is not self.xostack[-1]:
            val = self.structs.pop()
            self.xostack[-1].append(val)
        if isinstance(val,(str,unicode)):
            self.xostack[-1].append(val)

    def start_dt(self,attrs):
        self.textstack.append('')

    def end_dt(self):
        # the key text stays on textstack until the matching </dd>
        pass

    def start_dd(self,attrs):
        self.textstack.append('')

    def end_dd(self):
        """Store the definition under the key collected from the preceding <dt>."""
        val = self.textstack.pop()
        key = self.textstack.pop()
        # a container created inside the <dd> replaces its plain text
        if self.structs[-1] is not self.xostack[-1]:
            val = self.structs.pop()
        self.xostack[-1][key]=val

    def handleUnicodeData(self, text):
        """Add text to the innermost open text-bearing element."""
        if len(self.stack) and containerTags.get(self.stack[-1],True): #skip text not within an element
            self.textstack[-1] += text
def toXOXO(struct,addHTMLWrapper=False,cssUrl=''):
    """Return struct rendered as UTF-8 encoded XOXO markup.

    struct is rendered as an <ol class="xoxo">; anything that is not a list
    or tuple is first wrapped in a one-element list.  With addHTMLWrapper the
    list is placed inside a complete XHTML document, and a non-empty cssUrl
    adds a <style> element importing that stylesheet.
    """
    # isinstance matches the test makeXOXO applies, so list/tuple subclasses
    # are rendered directly instead of being wrapped in a second list.
    if isinstance(struct,(list,tuple)):
        inStruct = struct
    else:
        inStruct = [struct]
    body = makeXOXO(inStruct,'xoxo')
    if not addHTMLWrapper:
        return body.encode('utf-8')
    # NOTE(review): the DOCTYPE below has no closing quote after "//EN" and
    # no quotes around the DTD URL; the unit tests expect exactly this text,
    # so it is left unchanged here - confirm against the original source.
    s= u'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN
http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head profile=""><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'''
    if cssUrl:
        s+=u'<style type="text/css" >@import "%s";</style>' % cssUrl
    s+=u"</head><body>%s</body></html>" % body
    return s.encode('utf-8')
    
def fromXOXO(html):
    """Parse html and return the data structure found in its XOXO markup.

    Empty top-level structures are dropped, and a list that holds a single
    item is replaced by that item, repeatedly, so one outline is returned
    bare rather than wrapped in lists.
    """
    xparser = xoxoParser()
    xparser.feed(html)
    result = [found for found in xparser.structs if found]
    while True:
        if len(result) != 1 or type(result) is not list:
            break
        # unwrap a one-element list
        result = result[0]
    return result

# Allow direct invocation
# Read HTML from URL, parse into data structures, then re-output

import sys

if __name__ == "__main__":
    # Usage: xoxo.py url - fetch the page, parse its XOXO and print it again
    # as a complete XHTML document.
    if len(sys.argv) < 2:
        raise SystemExit("Usage: "+sys.argv[0]+" url\n"+__doc__)
    url = sys.argv[1]
    source = urllib.urlopen(url)
    try:
        html = source.read(-1)
    finally:
        # actually call close(); a bare attribute reference leaves the
        # connection open
        source.close()
    s = fromXOXO(html)
    p = toXOXO(s,True)
    print(p)

testxoxo.py

# -*- coding: utf-8 -*-
"""testxoxo.py 
Unit tests for xoxo.py
This file tests the functions in xoxo.py 
The underlying model here is http://diveintopython.org/unit_testing/index.html 

run from command line with
python testxoxo.py -v
"""
import xoxo
reload(xoxo)
import unittest

class xoxoTestCases(unittest.TestCase):
    '''Unit tests for xoxo.toXOXO and xoxo.fromXOXO.

    The expected markup strings are compared byte for byte, so whitespace
    inside the fixtures is significant.
    '''

    def testSimpleList(self):
        '''make a xoxo file from a list'''
        l = ['1','2','3']
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>1</li><li>2</li><li>3</li></ol>')
    def testNestedList(self):
        '''make a xoxo file from a list with a list in'''
        l = ['1',['2','3']]
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>1</li><li><ol><li>2</li><li>3</li></ol></li></ol>')

    def testDictionary(self):
        '''make a xoxo file from a dictionary'''
        # NOTE(review): the expected string fixes the order of the two
        # entries, so this depends on the dictionary's iteration order.
        d = {'test':'1','name':'Kevin'}
        html = xoxo.toXOXO(d)
        self.assertEqual(html,'<ol class="xoxo"><li><dl><dt>test</dt><dd>1</dd><dt>name</dt><dd>Kevin</dd></dl></li></ol>')

    def testSingleItem(self):
        '''make a xoxo file from a string'''
        l = "test"
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>test</li></ol>')

    def testWrapDiffers(self):
        '''make a xoxo file from a string with and without html wrapper and check they are different'''
        l = "test"
        html = xoxo.toXOXO(l)
        htmlwrap =  xoxo.toXOXO(l,addHTMLWrapper=True)
        self.failIfEqual(html,htmlwrap)

    def testWrapSingleItem(self):
        '''make a wrapped xoxo file from a string'''
        l = "test"
        html = xoxo.toXOXO(l,addHTMLWrapper=True)
        self.assertEqual(html,'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN
http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head profile=""><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body><ol class="xoxo"><li>test</li></ol></body></html>''')

    def testWrapItemWithCSS(self):
        '''make a wrapped xoxo file from a string, with a stylesheet import'''
        l = "test"
        html = xoxo.toXOXO(l,addHTMLWrapper=True,cssUrl='reaptest.css')
        self.assertEqual(html,'''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN
http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head profile=""><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><style type="text/css" >@import "reaptest.css";</style></head><body><ol class="xoxo"><li>test</li></ol></body></html>''')

    def testDictionaryRoundTrip(self):
        '''make a dictionary into a xoxo file and back again; check it is the same'''
        d = {'test':'1','name':'Kevin'}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)

    def testDictionaryWithURLRoundTrip(self):
        '''make a dictionary with a url in into a xoxo file and back again; check it is the same'''
        d = {'url':'http://example.com','name':'Kevin'}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)    
    def testNestedDictionaryRoundTrip(self):
        '''make a dictionary with a dict in into a xoxo file and back again; check it is the same'''
        d = {'test':'1','inner':{'name':'Kevin'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)
    def testNestedDictionaryWithURLRoundTrip(self):
        '''make a dictionary with a url and a dict into a xoxo file and back again; check it is the same'''
        d = {'url':'http://example.com','inner':{'name':'Kevin'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)
    def testNestedDictionariesWithURLsRoundTrip(self):
        '''make a dictionary with a url and a dict with a url into a xoxo file and back again; check it is the same'''
        d = {'url':'http://example.com','inner':{'name':'Kevin','url':'http://slashdot.org'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)
    def testListRoundTrip(self):
        '''make a list into a xoxo file and back again; check it is the same'''
        l = ['3','2','1']
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)
    def testListofDictsRoundTrip(self):
        '''make a list of dicts into a xoxo file and back again; check it is the same'''
        l = ['3',{'a':'2'},{'b':'1','c':'4'}]
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)
    def testListofListsRoundTrip(self):
        '''make a list of lists into a xoxo file and back again; check it is the same'''
        l = ['3',['a','2'],['b',['1',['c','4']]]]
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)
    def testDictofListsRoundTrip(self):
        '''make a dict with lists in into a xoxo file and back again; check it is the same'''
        d = {'test':['1','2'],
        'name':'Kevin',
        'nestlist':['a',['b','c']],
        'nestdict':{'e':'6','f':'7'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)

    def testXOXOjunkInContainers(self):
        '''make sure text directly inside <ol> or <dl> (outside <li>, <dt>, <dd>) is ignored'''
        d=xoxo.fromXOXO('<ol>bad<li><dl>worse<dt>good</dt><dd>buy</dd> now</dl></li></ol>')
        self.assertEqual(d,{'good': 'buy'})

    def testXOXOjunkInElements(self):
        '''make sure text within <li> but outside a subcontainer is ignored'''
        l=xoxo.fromXOXO('<ol><li>bad<dl><dt>good</dt><dd>buy</dd></dl>worse</li><li>bag<ol><li>OK</li></ol>fish</li></ol>')
        self.assertEqual(l,[{'good': 'buy'},['OK']])

    def testXOXOWithSpacesAndNewlines(self):
        '''parse some xoxo with spaces and newlines in and check the result is right'''
        xoxoSample= '''<ol class='xoxo'> 
  <li>
    <dl>
        <dt>text</dt>
        <dd>item 1</dd>
        <dt>description</dt>
        <dd> This item represents the main point we're trying to make.</dd>
        <dt>url</dt>
        <dd>http://example.com/more.xoxo</dd>
        <dt>title</dt>
        <dd>title of item 1</dd>
        <dt>type</dt>
        <dd>text/xml</dd>
        <dt>rel</dt>
        <dd>help</dd>
    </dl>
  </li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        d2={'text':'item 1',
            'description':" This item represents the main point we're trying to make.",
            'url':'http://example.com/more.xoxo',
            'title':'title of item 1',
            'type':'text/xml',
            'rel':'help'
            }
        # rendered again only for the disabled comparison at the end
        xoxoAgain = xoxo.toXOXO(d)
        self.assertEqual(d,d2)
        #this needs a smarter whitespace-sensitive comparison
        #self.assertEqual(xoxoSample,xoxoAgain)

    def testSpecialAttributeDecoding(self):
        '''parse some xoxo with <a href= rel= etc in and check it matches the equivalent <dl> form'''
        xoxoSample= '''<ol class='xoxo'> 
  <li>
    <dl>
        <dt>text</dt>
        <dd>item 1</dd>
        <dt>url</dt>
        <dd>http://example.com/more.xoxo</dd>
        <dt>title</dt>
        <dd>title of item 1</dd>
        <dt>type</dt>
        <dd>text/xml</dd>
        <dt>rel</dt>
        <dd>help</dd>
    </dl>
  </li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        smartxoxoSample= '''<ol class='xoxo'> 
  <li><a href="http://example.com/more.xoxo"
         title="title of item 1"
         type="text/xml"
         rel="help">item 1</a> 
<!-- note how the "text" property is simply the contents of the <a> element -->
  </li>
</ol>'''
        d2 = xoxo.fromXOXO(smartxoxoSample)
        self.assertEqual(d,d2)
    def testSpecialAttributeAndDLDecoding(self):
        '''parse some xoxo with <a href= rel= etc in plus a <dl> in the same item and check it matches the <dl>-only form'''
        xoxoSample= '''<ol class="xoxo"> 
  <li>
    <dl>
        <dt>text</dt>
        <dd>item 1</dd>
        <dt>description</dt>
        <dd> This item represents the main point we're trying to make.</dd>
        <dt>url</dt>
        <dd>http://example.com/more.xoxo</dd>
        <dt>title</dt>
        <dd>title of item 1</dd>
        <dt>type</dt>
        <dd>text/xml</dd>
        <dt>rel</dt>
        <dd>help</dd>
    </dl>
  </li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        smartxoxoSample= '''<ol class="xoxo"> 
  <li><a href="http://example.com/more.xoxo"
         title="title of item 1"
         type="text/xml"
         rel="help">item 1</a> 
<!-- note how the "text" property is simply the contents of the <a> element -->
      <dl>
        <dt>description</dt>
          <dd> This item represents the main point we're trying to make.</dd>
      </dl>
  </li>
</ol>'''
        d2 = xoxo.fromXOXO(smartxoxoSample)
        self.assertEqual(d,d2)
    def testSpecialAttributeEncode(self):
        '''check it makes an <a href with a url parameter'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help','text':'an example'}
        html=xoxo.toXOXO(d)
        expectedHTML= '<ol class="xoxo"><li><a href="http://example.com/more.xoxo" title="sample url" rel="help" type="text/xml" >an example</a></li></ol>' 
        self.assertEqual(html,expectedHTML)

    def testSpecialAttributeRoundTripFull(self):
        '''check a dictionary with url, title, type, rel and text survives a round trip'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help','text':'an example'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))
    def testSpecialAttributeRoundTripNoText(self):
        '''check a dictionary with a url but no text entry survives a round trip'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))
    def testSpecialAttributeRoundTripNoTextOrTitle(self):
        '''check a dictionary with only a url survives a round trip'''
        d={'url':'http://example.com/more.xoxo'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))
    def testAttentionRoundTrip(self):
        '''check nested <a> and <dl> and <a> are preserved'''
        kmattn='''<ol class="xoxo"><li><a href="http://www.boingboing.net/" title="Boing Boing Blog" >Boing Boing Blog</a><dl><dt>alturls</dt><dd><ol><li><a href="http://boingboing.net/rss.xml" >xmlurl</a></li></ol></dd><dt>description</dt><dd>Boing Boing Blog</dd></dl></li><li><a href="http://www.financialcryptography.com/" title="Financial Cryptography" >Financial Cryptography</a><dl><dt>alturls</dt><dd><ol><li><a href="http://www.financialcryptography.com/mt/index.rdf" >xmlurl</a></li></ol></dd><dt>description</dt><dd>Financial Cryptography</dd></dl></li><li><a href="http://hublog.hubmed.org/" title="HubLog" >HubLog</a><dl><dt>alturls</dt><dd><ol><li><a href="http://hublog.hubmed.org/index.xml" >xmlurl</a></li><li><a href="http://hublog.hubmed.org/foaf.rdf" >foafurl</a></li></ol></dd><dt>description</dt><dd>HubLog</dd></dl></li></ol>''';
        d = xoxo.fromXOXO(kmattn)
        newattn = xoxo.toXOXO(d)
        d2 = xoxo.fromXOXO(newattn)
        self.assertEqual(newattn,xoxo.toXOXO(d2))
        self.assertEqual(d,d2)
        self.assertEqual(kmattn,newattn)

    def testUnicodeRoundtrip(self):
        '''check unicode characters can go to xoxo and back'''
        src=unicode('Tantek \xc3\x87elik and a snowman \xe2\x98\x83','utf-8')
        html = xoxo.toXOXO(src)
        self.assertEqual(src,xoxo.fromXOXO(html))
    def testUtf8Roundtrip(self):
        '''check utf8 characters can go to xoxo and back'''
        src='Tantek \xc3\x87elik and a snowman \xe2\x98\x83'
        html = xoxo.toXOXO(src)
        self.assertEqual(src,xoxo.fromXOXO(html).encode('utf-8'))
    def testWindows1252Roundtrip(self):
        '''check 1252 characters can go to xoxo and back'''
        src='This is an evil\xa0space'
        html = xoxo.toXOXO(src)
        self.assertEqual(src,xoxo.fromXOXO(html).encode('windows-1252'))
if __name__ != "__main__":
    # Imported rather than executed: run the whole suite straight away with a
    # plain text runner.
    suite = unittest.makeSuite(xoxoTestCases,'test')
    runner = unittest.TextTestRunner()
    runner.run(suite)
else:
    # Executed as a script: let unittest handle the command line.
    unittest.main()