xoxo-sample-code: Difference between revisions
Jump to navigation
Jump to search
(→xoxo.py: added CLI invocation) |
|||
| Line 156: | Line 156: | ||
return makeXOXO(inStruct,'xoxo').encode('utf-8') | return makeXOXO(inStruct,'xoxo').encode('utf-8') | ||
import sys | |||
def fromXOXO(html): | def fromXOXO(html): | ||
parser = xoxoParser() | parser = xoxoParser() | ||
parser.feed(unicode(html,'utf-8')) | parser.feed(unicode(html,'utf-8')) | ||
#print parser.structs | #print >>sys.stderr, parser.structs | ||
structs=[struct for struct in parser.structs if struct] | structs=[struct for struct in parser.structs if struct] | ||
#print structs | #print >>sys.stderr, structs | ||
while (len(structs) ==1 and type(structs)==type([1,])): | while (len(structs) ==1 and type(structs)==type([1,])): | ||
structs=structs[0] | structs=structs[0] | ||
return structs | return structs | ||
# Allow direct invocation | |||
# Read HTML from URL, parse into data structures, then re-output | |||
if __name__ == "__main__": | |||
if len(sys.argv) < 2: raise SystemExit("Usage: "+sys.argv[0]+" url\n"+__doc__) | |||
url=sys.argv[1] | |||
file = urllib.urlopen(url) | |||
html=file.read(-1) | |||
file.close | |||
s=fromXOXO(html) | |||
p=toXOXO(s,True) | |||
print p | |||
</nowiki></pre> | </nowiki></pre> | ||
Revision as of 22:02, 19 October 2005
XOXO Sample Code
A collection of open source (CC-by-2.0, Apache 2.0) sample code for reading and writing XOXO files in Python (with Perl, PHP, ... to follow).
xoxo.py
# -*- coding: utf-8 -*-
"""xoxo.py - a utility module for transforming to and from the XHTMLOutlines format XOXO
toXOXO takes a Python datastructure (tuples, lists or dictionaries, arbitrarily nested) and returns a XOXO representation of it.
fromXOXO parses an XHTML file for a xoxo list and returns the structure
"""
# Module metadata.
__version__ = "0.8"
__date__ = "2004-10-05"
__author__ = "Kevin Marks <kmarks@technorati.com>"
__copyright__ = "Copyright 2004, Kevin Marks & Technorati"
# Dual licence, written as two wiki-style "[url label]" links.
__license__ = "[http://creativecommons.org/licenses/by/2.0/ CC-by-2.0], [http://www.apache.org/licenses/LICENSE-2.0 Apache 2.0]"
__credits__ = """Tantek Çelik and Mark Pilgrim for data structure"""
# Change log, newest entry first; the TODO lines are still open.
__history__ = """
TODO: add <title> tag
TODO: add a proper profile link
0.8 work in unicode then render to utf-8
0.7 initial encoding support - just utf-8 for now
0.6 support the special behaviour for url properties to/from <a>
0.5 fix some awkward side effects of whitespace and text outside our expected tags; simplify writing code
0.4 add correct XHTML headers so it validates
0.3 read/write version; fixed invalid nested list generation;
0.1 first write-only version
"""
# Compatibility shim: if the names True/False are not defined, the bare
# reference below raises NameError and they are created by hand.
try:
    True, False
except NameError:
    True, False = not not 1, not 1
# Tags looked up by xoxoParser.handle_data(): text whose innermost open tag is
# listed here (with a false value) is skipped; any other tag defaults to True.
containerTags={'ol':False,'ul':False,'dl':False}
import sgmllib, urllib, urlparse, re
def makeXOXO(struct, className=None, depth=0):
    """Render a Python data structure as a XOXO markup fragment.

    struct    -- a dict, list, tuple, text string, byte string or any other
                 object (rendered through ``str()``); containers may nest.
    className -- optional ``class`` attribute for the ``<ol>`` produced when
                 ``struct`` is a list or tuple; ignored for other types.
    depth     -- nesting level, passed on (incremented) to recursive calls.

    Returns a text (unicode) string:

    * list/tuple -> ``<ol>`` with one ``<li>`` per item;
    * dict       -> if it has a ``'url'`` key, an ``<a href=...>`` element
                    carrying the ``title``/``rel``/``type`` keys as
                    attributes and ``text`` (falling back to ``title``, then
                    ``url``) as content; every remaining key becomes a
                    ``<dt>``/``<dd>`` pair inside a ``<dl>``;
    * byte string -> decoded as UTF-8, falling back to windows-1252;
    * anything else -> its ``str()`` form.

    NOTE(review): keys and values are interpolated into the markup without
    any HTML escaping, so callers must not pass untrusted text containing
    ``<``, ``&`` or ``"``.
    """
    # Resolve the text/byte string types once so the function does not
    # depend on a ``unicode`` builtin being present.
    try:
        text_type, binary_type = unicode, str
    except NameError:
        text_type, binary_type = str, bytes

    if isinstance(struct, dict):
        remaining = struct.copy()  # work on a copy; the caller's dict is untouched
        parts = []
        if 'url' in remaining:
            parts.append(u'<a href="%s" ' % remaining['url'])
            # Link text falls back to the title, then to the URL itself.
            text = remaining.get('text',
                                 remaining.get('title', remaining['url']))
            for attr in ('title', 'rel', 'type'):
                if attr in remaining:
                    rendered = makeXOXO(remaining[attr], None, depth + 1)
                    parts.append(u'%s="%s" ' % (attr, rendered))
                    del remaining[attr]
            parts.append(u'>%s</a>' % makeXOXO(text, None, depth + 1))
            if 'text' in remaining:
                del remaining['text']
            del remaining['url']
        if remaining:
            # Whatever the <a> did not consume is written as a definition list.
            parts.append(u'<dl>')
            for key, value in remaining.items():
                rendered = makeXOXO(value, None, depth + 1)
                parts.append(u'<dt>%s</dt><dd>%s</dd>' % (key, rendered))
            parts.append(u'</dl>')
        return u''.join(parts)

    # The same isinstance test decides both the opening and the closing of
    # the <ol>, so list/tuple subclasses are rendered as complete lists.
    if isinstance(struct, (list, tuple)):
        if className:
            parts = [u'<ol class="%s">' % className]
        else:
            parts = [u'<ol>']
        for item in struct:
            parts.append(u'<li>' + makeXOXO(item, None, depth + 1) + u'</li>')
        parts.append(u'</ol>')
        return u''.join(parts)

    if isinstance(struct, text_type):
        return u'' + struct

    if not isinstance(struct, binary_type):
        struct = str(struct)
        if isinstance(struct, text_type):
            # str() already produced text; there is nothing to decode.
            return struct
    try:
        return struct.decode('utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the windows-1252 code page.
        return struct.decode('windows_1252')
class xoxoParser(sgmllib.SGMLParser):
    """SGML parser that rebuilds Python lists and dicts from XOXO markup.

    ``<ol>``/``<ul>`` become lists, ``<dl>`` becomes a dict keyed by the
    ``<dt>`` text, and ``<a>`` becomes a dict holding its attributes (with
    ``href`` stored as ``'url'``) plus its content as ``'text'``.  After
    feeding, the finished top-level structures are left in ``self.structs``.
    """

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        # Structures created so far that have not been attached to a parent.
        self.structs = []
        # Containers currently open; the last entry receives new items.
        self.xostack = []
        # Text accumulators, one per open <li>/<dt>/<dd>/<a>.
        self.textstack = ['']

    def normalize_attrs(self, attrs):
        """Return ``attrs`` with lower-cased names and cleaned-up values.

        Numeric character references in values are replaced by the
        character they name, values are stripped, and the values of ``rel``
        and ``type`` are lower-cased.
        """
        # NOTE(review): relies on sgmllib.charref exposing the numeric code
        # as its first group - verify against the sgmllib version in use.
        attrs = [(k.lower(), sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v).strip()) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def pushStruct(self, struct):
        """Open ``struct`` as the current container.

        An empty dict (a ``<dl>``) that directly follows a dict created by
        an ``<a>`` is merged into that dict instead of starting a new one.
        """
        if (isinstance(struct, dict) and len(struct) == 0
                and len(self.structs)
                and isinstance(self.structs[-1], dict)
                and 'url' in self.structs[-1]):
            self.xostack.append(self.structs[-1])  # put back the <a>-made one for extra def's
        else:
            self.structs.append(struct)
            self.xostack.append(self.structs[-1])

    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        # An <a> without href (e.g. a named anchor) gets an empty url rather
        # than raising KeyError.
        attrsD['url'] = attrsD.get('href', '')
        if 'href' in attrsD:
            del attrsD['href']
        self.pushStruct(attrsD)
        self.textstack.append('')

    def end_a(self):
        val = self.textstack.pop()
        if val:
            # Content that only repeats the title or the url is not stored.
            if self.xostack[-1].get('title', '') == val:
                val = ''
            if self.xostack[-1]['url'] == val:
                val = ''
            if val:
                self.xostack[-1]['text'] = val
        self.xostack.pop()

    def start_dl(self, attrs):
        self.pushStruct({})

    def end_dl(self):
        self.xostack.pop()

    def start_ol(self, attrs):
        self.pushStruct([])

    def end_ol(self):
        self.xostack.pop()

    def start_ul(self, attrs):
        self.pushStruct([])

    def end_ul(self):
        self.xostack.pop()

    def start_li(self, attrs):
        self.textstack.append('')

    def end_li(self):
        val = self.textstack.pop()
        # If a structure was built inside this <li> it replaces the text.
        # Identity, not equality, is the right test: a nested list may be
        # equal in value to its parent (['a', ['a']]) and must still be used.
        if self.structs[-1] is not self.xostack[-1]:
            val = self.structs.pop()
        self.xostack[-1].append(val)

    def start_dt(self, attrs):
        self.textstack.append('')

    def end_dt(self):
        # The <dt> text stays on textstack; end_dd pops it as the key.
        pass

    def start_dd(self, attrs):
        self.textstack.append('')

    def end_dd(self):
        val = self.textstack.pop()
        key = self.textstack.pop()
        # Same identity test as end_li: prefer a structure built in the <dd>.
        if self.structs[-1] is not self.xostack[-1]:
            val = self.structs.pop()
        self.xostack[-1][key] = val

    def handle_data(self, text):
        if len(self.stack) and containerTags.get(self.stack[-1], True):  # skip text not within an element
            self.textstack[-1] += text
def toXOXO(struct, addHTMLWrapper=False, cssUrl=''):
    """Serialise ``struct`` as a UTF-8 encoded XOXO document or fragment.

    struct         -- data to render; anything that is not a list or tuple
                      is wrapped in a one-element list first.
    addHTMLWrapper -- when true, wrap the ``<ol class="xoxo">`` in a complete
                      XHTML 1.0 Transitional page (doctype, head with a
                      charset declaration, body).
    cssUrl         -- optional stylesheet URL, imported from a ``<style>``
                      element in the head; only used with the wrapper.

    Returns the markup encoded as a UTF-8 byte string.
    """
    if isinstance(struct, (list, tuple)):
        inStruct = struct
    else:
        inStruct = [struct]
    body = makeXOXO(inStruct, 'xoxo')
    if not addHTMLWrapper:
        return body.encode('utf-8')
    # The public and system identifiers are two separately quoted strings.
    s = '''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head profile=""><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'''
    if cssUrl:
        # NOTE(review): cssUrl is interpolated without escaping.
        s += '<style type="text/css" >@import "%s";</style>' % cssUrl
    s += "</head><body>%s</body></html>" % body
    return s.encode('utf-8')
import sys
def fromXOXO(html):
    """Parse XOXO markup and return the Python structure it describes.

    html -- the markup, either a UTF-8 encoded byte string or a unicode
            string (which is used as-is).

    Returns the non-empty structures collected by ``xoxoParser``.  While the
    result is a list with exactly one element, it is replaced by that
    element, so a document holding a single dict or string yields it
    directly.
    """
    parser = xoxoParser()
    if not isinstance(html, unicode):
        html = unicode(html, 'utf-8')
    parser.feed(html)
    parser.close()  # flush any input the parser is still buffering
    #print >>sys.stderr, parser.structs
    structs = [struct for struct in parser.structs if struct]
    #print >>sys.stderr, structs
    while isinstance(structs, list) and len(structs) == 1:
        structs = structs[0]
    return structs
# Allow direct invocation
# Read HTML from URL, parse into data structures, then re-output
if __name__ == "__main__":
    if len(sys.argv) < 2:
        raise SystemExit("Usage: "+sys.argv[0]+" url\n"+__doc__)
    url = sys.argv[1]
    source = urllib.urlopen(url)
    try:
        html = source.read(-1)
    finally:
        # Actually call close(); a bare attribute reference would leave the
        # connection open.
        source.close()
    s = fromXOXO(html)
    p = toXOXO(s, True)
    # toXOXO returns an encoded byte string; write it out followed by a newline.
    sys.stdout.write(p + "\n")
testxoxo.py
# -*- coding: utf-8 -*-
"""testxoxo.py
Unit tests for xoxo.py
This file tests the functions in xoxo.py
The underlying model here is http://diveintopython.org/unit_testing/index.html
run from command line with
python testxoxo.py -v
"""
import xoxo
reload(xoxo)
import unittest
class xoxoTestCases(unittest.TestCase):
    """Unit tests for xoxo.toXOXO and xoxo.fromXOXO."""

    def testSimpleList(self):
        '''make a xoxo file from a list'''
        l = ['1','2','3']
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>1</li><li>2</li><li>3</li></ol>')

    def testNestedList(self):
        '''make a xoxo file from a list with a list in'''
        l = ['1',['2','3']]
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>1</li><li><ol><li>2</li><li>3</li></ol></li></ol>')

    def testDictionary(self):
        '''make a xoxo file from a dictionary'''
        d = {'test':'1','name':'Kevin'}
        html = xoxo.toXOXO(d)
        # The order in which a dict yields its keys is not something the
        # test should rely on, so either ordering of the pairs is accepted.
        expected = ['<ol class="xoxo"><li><dl><dt>test</dt><dd>1</dd><dt>name</dt><dd>Kevin</dd></dl></li></ol>',
                    '<ol class="xoxo"><li><dl><dt>name</dt><dd>Kevin</dd><dt>test</dt><dd>1</dd></dl></li></ol>']
        self.assertTrue(html in expected)

    def testSingleItem(self):
        '''make a xoxo file from a string'''
        l = "test"
        html = xoxo.toXOXO(l)
        self.assertEqual(html,'<ol class="xoxo"><li>test</li></ol>')

    def testWrapDiffers(self):
        '''make a xoxo file from a string with and without html wrapper and check they are different'''
        l = "test"
        html = xoxo.toXOXO(l)
        htmlwrap = xoxo.toXOXO(l,addHTMLWrapper=True)
        self.assertNotEqual(html,htmlwrap)

    def testWrapSingleItem(self):
        '''make a wrapped xoxo file from a string'''
        l = "test"
        html = xoxo.toXOXO(l,addHTMLWrapper=True)
        # Check the parts of the wrapper that matter instead of one literal:
        # the doctype opening, the charset declaration and the wrapped list.
        self.assertTrue(html.startswith('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN'))
        self.assertTrue('<html xmlns="http://www.w3.org/1999/xhtml"><head profile="">' in html)
        self.assertTrue('<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />' in html)
        self.assertTrue(html.endswith('</head><body><ol class="xoxo"><li>test</li></ol></body></html>'))

    def testDictionaryRoundTrip(self):
        ''' make a dictionary into a xoxo file and back again; check it is the same'''
        d = {'test':'1','name':'Kevin'}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)

    def testListRoundTrip(self):
        ''' make a list into a xoxo file and back again; check it is the same'''
        l = ['3','2','1']
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)

    def testListofDictsRoundTrip(self):
        ''' make a list of Dicts into a xoxo file and back again; check it is the same'''
        l = ['3',{'a':'2'},{'b':'1','c':'4'}]
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)

    def testListofListsRoundTrip(self):
        ''' make a list of Lists into a xoxo file and back again; check it is the same'''
        l = ['3',['a','2'],['b',['1',['c','4']]]]
        html = xoxo.toXOXO(l)
        newdl= xoxo.fromXOXO(html)
        self.assertEqual(l,newdl)

    def testDictofListsRoundTrip(self):
        ''' make a dict with lists in into a xoxo file and back again; check it is the same'''
        d = {'test':['1','2'],
             'name':'Kevin',
             'nestlist':['a',['b','c']],
             'nestdict':{'e':'6','f':'7'}}
        html = xoxo.toXOXO(d)
        newd = xoxo.fromXOXO(html)
        self.assertEqual(d,newd)

    def testXOXOjunkInContainers(self):
        '''make sure text outside <li> etc is ignored'''
        d=xoxo.fromXOXO('<ol>bad<li><dl>worse<dt>good</dt><dd>buy</dd> now</dl></li></ol>')
        self.assertEqual(d,{'good': 'buy'})

    def testXOXOjunkInElements(self):
        '''make sure text within <li> but outside a subcontainer is ignored'''
        l=xoxo.fromXOXO('<ol><li>bad<dl><dt>good</dt><dd>buy</dd></dl>worse</li><li>bag<ol><li>OK</li></ol>fish</li></ol>')
        self.assertEqual(l,[{'good': 'buy'},['OK']])

    def testXOXOWithSpacesAndNewlines(self):
        '''unmung some xoxo with spaces in and check result is right'''
        xoxoSample= '''<ol class='xoxo'>
<li>
<dl>
<dt>text</dt>
<dd>item 1</dd>
<dt>description</dt>
<dd> This item represents the main point we're trying to make.</dd>
<dt>url</dt>
<dd>http://example.com/more.xoxo</dd>
<dt>title</dt>
<dd>title of item 1</dd>
<dt>type</dt>
<dd>text/xml</dd>
<dt>rel</dt>
<dd>help</dd>
</dl>
</li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        d2={'text':'item 1',
            'description':" This item represents the main point we're trying to make.",
            'url':'http://example.com/more.xoxo',
            'title':'title of item 1',
            'type':'text/xml',
            'rel':'help'
            }
        # Re-serialising is still exercised even though its output is not
        # compared yet (see the note below).
        xoxoAgain = xoxo.toXOXO(d)
        self.assertEqual(d,d2)
        #this needs a smarter whitespace-sensitive comparison
        #self.assertEqual(xoxoSample,xoxoAgain)

    def testSpecialAttributeDecoding(self):
        '''unmung some xoxo with <a href=' rel= etc in and check result is right'''
        xoxoSample= '''<ol class='xoxo'>
<li>
<dl>
<dt>text</dt>
<dd>item 1</dd>
<dt>url</dt>
<dd>http://example.com/more.xoxo</dd>
<dt>title</dt>
<dd>title of item 1</dd>
<dt>type</dt>
<dd>text/xml</dd>
<dt>rel</dt>
<dd>help</dd>
</dl>
</li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        smartxoxoSample= '''<ol class='xoxo'>
<li><a href="http://example.com/more.xoxo"
title="title of item 1"
type="text/xml"
rel="help">item 1</a>
<!-- note how the "text" property is simply the contents of the <a> element -->
</li>
</ol>'''
        d2 = xoxo.fromXOXO(smartxoxoSample)
        self.assertEqual(d,d2)

    def testSpecialAttributeAndDLDecoding(self):
        '''unmung some xoxo with <a href=' rel= etc in plus a <dl> in the same item and check result is right'''
        xoxoSample= '''<ol class="xoxo">
<li>
<dl>
<dt>text</dt>
<dd>item 1</dd>
<dt>description</dt>
<dd> This item represents the main point we're trying to make.</dd>
<dt>url</dt>
<dd>http://example.com/more.xoxo</dd>
<dt>title</dt>
<dd>title of item 1</dd>
<dt>type</dt>
<dd>text/xml</dd>
<dt>rel</dt>
<dd>help</dd>
</dl>
</li>
</ol>'''
        d = xoxo.fromXOXO(xoxoSample)
        smartxoxoSample= '''<ol class="xoxo">
<li><a href="http://example.com/more.xoxo"
title="title of item 1"
type="text/xml"
rel="help">item 1</a>
<!-- note how the "text" property is simply the contents of the <a> element -->
<dl>
<dt>description</dt>
<dd> This item represents the main point we're trying to make.</dd>
</dl>
</li>
</ol>'''
        d2 = xoxo.fromXOXO(smartxoxoSample)
        self.assertEqual(d,d2)

    def testSpecialAttributeEncode(self):
        '''check it makes an <a href with a url parameter'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help','text':'an example'}
        html=xoxo.toXOXO(d)
        expectedHTML= '<ol class="xoxo"><li><a href="http://example.com/more.xoxo" title="sample url" rel="help" type="text/xml" >an example</a></li></ol>'
        self.assertEqual(html,expectedHTML)

    def testSpecialAttributeRoundTripFull(self):
        '''check a dict with url, title, type, rel and text survives a round trip'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help','text':'an example'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))

    def testSpecialAttributeRoundTripNoText(self):
        '''check it makes an <a href with a url parameter and no text attribute'''
        d={'url':'http://example.com/more.xoxo','title':'sample url','type':"text/xml",'rel':'help'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))

    def testSpecialAttributeRoundTripNoTextOrTitle(self):
        '''check it makes an <a href with a url parameter and no text or title attribute'''
        d={'url':'http://example.com/more.xoxo'}
        html=xoxo.toXOXO(d)
        self.assertEqual(d,xoxo.fromXOXO(html))

    def testUnicodeRoundtrip(self):
        '''check unicode characters can go to xoxo and back'''
        # Escapes keep the non-ASCII characters (C-cedilla, U+2603 SNOWMAN)
        # intact regardless of how this file is stored or transferred.
        src = u'Tantek \xc7elik and a snowman \u2603'
        html = xoxo.toXOXO(src)
        self.assertEqual(src,xoxo.fromXOXO(html))
# When this module is imported rather than executed, build the suite from
# every ``test*`` method and run it with a text runner straight away; when
# it is run as a script, hand control to unittest's own command-line driver.
if __name__ != "__main__":
    suite = unittest.makeSuite(xoxoTestCases, 'test')
    runner = unittest.TextTestRunner()
    runner.run(suite)
else:
    unittest.main()