ubuntu-bots/Webcal/icalendar/parser.py

523 lines
17 KiB
Python
Raw Blame History

# -*- coding: latin-1 -*-
"""
This module parses and generates contentlines as defined in RFC 2445
(iCalendar), but will probably work for other MIME types with similar syntax.
Eg. RFC 2426 (vCard)
It is stupid in the sense that it treats the content purely as strings. No type
conversion is attempted.
Copyright, 2005: Max M <maxm@mxm.dk>
License: GPL (Just contact med if and why you would like it changed)
"""
# from python
from types import TupleType, ListType
SequenceTypes = [TupleType, ListType]
import re
# from this package
from icalendar.caselessdict import CaselessDict
#################################################################
# Property parameter stuff
def paramVal(val):
"Returns a parameter value"
if type(val) in SequenceTypes:
return q_join(val)
return dQuote(val)
# Could be improved
NAME = re.compile('[\w-]+')
UNSAFE_CHAR = re.compile('[\x00-\x08\x0a-\x1f\x7F",:;]')
QUNSAFE_CHAR = re.compile('[\x00-\x08\x0a-\x1f\x7F"]')
FOLD = re.compile('([\r]?\n)+[ \t]{1}')
def validate_token(name):
match = NAME.findall(name)
if len(match) == 1 and name == match[0]:
return
raise ValueError, name
def validate_param_value(value, quoted=True):
validator = UNSAFE_CHAR
if quoted:
validator = QUNSAFE_CHAR
if validator.findall(value):
raise ValueError, value
QUOTABLE = re.compile('[,;:].')
def dQuote(val):
"""
Parameter values containing [,;:] must be double quoted
>>> dQuote('Max')
'Max'
>>> dQuote('Rasmussen, Max')
'"Rasmussen, Max"'
>>> dQuote('name:value')
'"name:value"'
"""
if QUOTABLE.search(val):
return '"%s"' % val
return val
# parsing helper
def q_split(st, sep=','):
"""
Splits a string on char, taking double (q)uotes into considderation
>>> q_split('Max,Moller,"Rasmussen, Max"')
['Max', 'Moller', '"Rasmussen, Max"']
"""
result = []
cursor = 0
length = len(st)
inquote = 0
for i in range(length):
ch = st[i]
if ch == '"':
inquote = not inquote
if not inquote and ch == sep:
result.append(st[cursor:i])
cursor = i + 1
if i + 1 == length:
result.append(st[cursor:])
return result
def q_join(lst, sep=','):
"""
Joins a list on sep, quoting strings with QUOTABLE chars
>>> s = ['Max', 'Moller', 'Rasmussen, Max']
>>> q_join(s)
'Max,Moller,"Rasmussen, Max"'
"""
return sep.join([dQuote(itm) for itm in lst])
class Parameters(CaselessDict):
"""
Parser and generator of Property parameter strings. It knows nothing of
datatypes. It's main concern is textual structure.
Simple parameter:value pair
>>> p = Parameters(parameter1='Value1')
>>> str(p)
'PARAMETER1=Value1'
keys are converted to upper
>>> p.keys()
['PARAMETER1']
Parameters are case insensitive
>>> p['parameter1']
'Value1'
>>> p['PARAMETER1']
'Value1'
Parameter with list of values must be seperated by comma
>>> p = Parameters({'parameter1':['Value1', 'Value2']})
>>> str(p)
'PARAMETER1=Value1,Value2'
Multiple parameters must be seperated by a semicolon
>>> p = Parameters({'RSVP':'TRUE', 'ROLE':'REQ-PARTICIPANT'})
>>> str(p)
'ROLE=REQ-PARTICIPANT;RSVP=TRUE'
Parameter values containing ',;:' must be double quoted
>>> p = Parameters({'ALTREP':'http://www.wiz.org'})
>>> str(p)
'ALTREP="http://www.wiz.org"'
list items must be quoted seperately
>>> p = Parameters({'MEMBER':['MAILTO:projectA@host.com', 'MAILTO:projectB@host.com', ]})
>>> str(p)
'MEMBER="MAILTO:projectA@host.com","MAILTO:projectB@host.com"'
Now the whole sheebang
>>> p = Parameters({'parameter1':'Value1', 'parameter2':['Value2', 'Value3'],\
'ALTREP':['http://www.wiz.org', 'value4']})
>>> str(p)
'ALTREP="http://www.wiz.org",value4;PARAMETER1=Value1;PARAMETER2=Value2,Value3'
We can also parse parameter strings
>>> Parameters.from_string('PARAMETER1=Value 1;param2=Value 2')
Parameters({'PARAMETER1': 'Value 1', 'PARAM2': 'Value 2'})
Including empty strings
>>> Parameters.from_string('param=')
Parameters({'PARAM': ''})
We can also parse parameter strings
>>> Parameters.from_string('MEMBER="MAILTO:projectA@host.com","MAILTO:projectB@host.com"')
Parameters({'MEMBER': ['MAILTO:projectA@host.com', 'MAILTO:projectB@host.com']})
We can also parse parameter strings
>>> Parameters.from_string('ALTREP="http://www.wiz.org",value4;PARAMETER1=Value1;PARAMETER2=Value2,Value3')
Parameters({'PARAMETER1': 'Value1', 'ALTREP': ['http://www.wiz.org', 'value4'], 'PARAMETER2': ['Value2', 'Value3']})
"""
def params(self):
"""
in rfc2445 keys are called parameters, so this is to be consitent with
the naming conventions
"""
return self.keys()
### Later, when I get more time... need to finish this off now. The last majot thing missing.
### def _encode(self, name, value, cond=1):
### # internal, for conditional convertion of values.
### if cond:
### klass = types_factory.for_property(name)
### return klass(value)
### return value
###
### def add(self, name, value, encode=0):
### "Add a parameter value and optionally encode it."
### if encode:
### value = self._encode(name, value, encode)
### self[name] = value
###
### def decoded(self, name):
### "returns a decoded value, or list of same"
def __repr__(self):
return 'Parameters(' + dict.__repr__(self) + ')'
def __str__(self):
result = []
items = self.items()
items.sort() # To make doctests work
for key, value in items:
value = paramVal(value)
result.append('%s=%s' % (key.upper(), value))
return ';'.join(result)
def from_string(st, strict=False):
"Parses the parameter format from ical text format"
try:
# parse into strings
result = Parameters()
for param in q_split(st, ';'):
key, val = q_split(param, '=')
validate_token(key)
param_values = [v for v in q_split(val, ',')]
# Property parameter values that are not in quoted
# strings are case insensitive.
vals = []
for v in param_values:
if v.startswith('"') and v.endswith('"'):
v = v.strip('"')
validate_param_value(v, quoted=True)
vals.append(v)
else:
validate_param_value(v, quoted=False)
if strict:
vals.append(v.upper())
else:
vals.append(v)
if not vals:
result[key] = val
else:
if len(vals) == 1:
result[key] = vals[0]
else:
result[key] = vals
return result
except:
raise ValueError, 'Not a valid parameter string'
from_string = staticmethod(from_string)
#########################################
# parsing and generation of content lines
class Contentline(str):
"""
A content line is basically a string that can be folded and parsed into
parts.
>>> c = Contentline('Si meliora dies, ut vina, poemata reddit')
>>> str(c)
'Si meliora dies, ut vina, poemata reddit'
A long line gets folded
>>> c = Contentline(''.join(['123456789 ']*10))
>>> str(c)
'123456789 123456789 123456789 123456789 123456789 123456789 123456789 1234\\r\\n 56789 123456789 123456789 '
A folded line gets unfolded
>>> c = Contentline.from_string(str(c))
>>> c
'123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789 123456789 '
We do not fold within a UTF-8 character:
>>> c = Contentline('This line has a UTF-8 character where it should be folded. Make sure it g\xc3\xabts folded before that character.')
>>> '\xc3\xab' in str(c)
True
Don't fail if we fold a line that is exactly X times 74 characters long:
>>> c = str(Contentline(''.join(['x']*148)))
It can parse itself into parts. Which is a tuple of (name, params, vals)
>>> c = Contentline('dtstart:20050101T120000')
>>> c.parts()
('dtstart', Parameters({}), '20050101T120000')
>>> c = Contentline('dtstart;value=datetime:20050101T120000')
>>> c.parts()
('dtstart', Parameters({'VALUE': 'datetime'}), '20050101T120000')
>>> c = Contentline('ATTENDEE;CN=Max Rasmussen;ROLE=REQ-PARTICIPANT:MAILTO:maxm@example.com')
>>> c.parts()
('ATTENDEE', Parameters({'ROLE': 'REQ-PARTICIPANT', 'CN': 'Max Rasmussen'}), 'MAILTO:maxm@example.com')
>>> str(c)
'ATTENDEE;CN=Max Rasmussen;ROLE=REQ-PARTICIPANT:MAILTO:maxm@example.com'
and back again
>>> parts = ('ATTENDEE', Parameters({'ROLE': 'REQ-PARTICIPANT', 'CN': 'Max Rasmussen'}), 'MAILTO:maxm@example.com')
>>> Contentline.from_parts(parts)
'ATTENDEE;CN=Max Rasmussen;ROLE=REQ-PARTICIPANT:MAILTO:maxm@example.com'
and again
>>> parts = ('ATTENDEE', Parameters(), 'MAILTO:maxm@example.com')
>>> Contentline.from_parts(parts)
'ATTENDEE:MAILTO:maxm@example.com'
A value can also be any of the types defined in PropertyValues
>>> from icalendar.prop import vText
>>> parts = ('ATTENDEE', Parameters(), vText('MAILTO:test@example.com'))
>>> Contentline.from_parts(parts)
'ATTENDEE:MAILTO:test@example.com'
A value can also be unicode
>>> from icalendar.prop import vText
>>> parts = ('SUMMARY', Parameters(), vText(u'INternational char <20> <20> <20>'))
>>> Contentline.from_parts(parts)
'SUMMARY:INternational char \\xc3\\xa6 \\xc3\\xb8 \\xc3\\xa5'
Traversing could look like this.
>>> name, params, vals = c.parts()
>>> name
'ATTENDEE'
>>> vals
'MAILTO:maxm@example.com'
>>> for key, val in params.items():
... (key, val)
('ROLE', 'REQ-PARTICIPANT')
('CN', 'Max Rasmussen')
And the traditional failure
>>> c = Contentline('ATTENDEE;maxm@example.com')
>>> c.parts()
Traceback (most recent call last):
...
ValueError: Content line could not be parsed into parts
Another failure:
>>> c = Contentline(':maxm@example.com')
>>> c.parts()
Traceback (most recent call last):
...
ValueError: Content line could not be parsed into parts
>>> c = Contentline('key;param=:value')
>>> c.parts()
('key', Parameters({'PARAM': ''}), 'value')
>>> c = Contentline('key;param="pvalue":value')
>>> c.parts()
('key', Parameters({'PARAM': 'pvalue'}), 'value')
Should bomb on missing param:
>>> c = Contentline.from_string("k;:no param")
>>> c.parts()
Traceback (most recent call last):
...
ValueError: Content line could not be parsed into parts
>>> c = Contentline('key;param=pvalue:value', strict=False)
>>> c.parts()
('key', Parameters({'PARAM': 'pvalue'}), 'value')
If strict is set to True, uppercase param values that are not
double-quoted, this is because the spec says non-quoted params are
case-insensitive.
>>> c = Contentline('key;param=pvalue:value', strict=True)
>>> c.parts()
('key', Parameters({'PARAM': 'PVALUE'}), 'value')
>>> c = Contentline('key;param="pValue":value', strict=True)
>>> c.parts()
('key', Parameters({'PARAM': 'pValue'}), 'value')
"""
def __new__(cls, st, strict=False):
self = str.__new__(cls, st)
setattr(self, 'strict', strict)
return self
def from_parts(parts):
"Turns a tuple of parts into a content line"
(name, params, values) = [str(p) for p in parts]
try:
if params:
return Contentline('%s;%s:%s' % (name, params, values))
return Contentline('%s:%s' % (name, values))
except:
raise ValueError(
'Property: %s Wrong values "%s" or "%s"' % (repr(name),
repr(params),
repr(values)))
from_parts = staticmethod(from_parts)
def parts(self):
""" Splits the content line up into (name, parameters, values) parts
"""
try:
name_split = None
value_split = None
inquotes = 0
for i in range(len(self)):
ch = self[i]
if not inquotes:
if ch in ':;' and not name_split:
name_split = i
if ch == ':' and not value_split:
value_split = i
if ch == '"':
inquotes = not inquotes
name = self[:name_split]
if not name:
raise ValueError, 'Key name is required'
validate_token(name)
if name_split+1 == value_split:
raise ValueError, 'Invalid content line'
params = Parameters.from_string(self[name_split+1:value_split],
strict=self.strict)
values = self[value_split+1:]
return (name, params, values)
except:
raise ValueError, 'Content line could not be parsed into parts'
def from_string(st, strict=False):
"Unfolds the content lines in an iCalendar into long content lines"
try:
# a fold is carriage return followed by either a space or a tab
return Contentline(FOLD.sub('', st), strict=strict)
except:
raise ValueError, 'Expected StringType with content line'
from_string = staticmethod(from_string)
def __str__(self):
"Long content lines are folded so they are less than 75 characters wide"
l_line = len(self)
new_lines = []
start = 0
end = 74
while True:
if end >= l_line:
end = l_line
else:
# Check that we don't fold in the middle of a UTF-8 character:
# http://lists.osafoundation.org/pipermail/ietf-calsify/2006-August/001126.html
while True:
char_value = ord(self[end])
if char_value < 128 or char_value >= 192:
# This is not in the middle of a UTF-8 character, so we
# can fold here:
break
else:
end -= 1
new_lines.append(self[start:end])
if end == l_line:
# Done
break
start = end
end = start + 74
return '\r\n '.join(new_lines)
class Contentlines(list):
"""
I assume that iCalendar files generally are a few kilobytes in size. Then
this should be efficient. for Huge files, an iterator should probably be
used instead.
>>> c = Contentlines([Contentline('BEGIN:VEVENT\\r\\n')])
>>> str(c)
'BEGIN:VEVENT\\r\\n'
Lets try appending it with a 100 charater wide string
>>> c.append(Contentline(''.join(['123456789 ']*10)+'\\r\\n'))
>>> str(c)
'BEGIN:VEVENT\\r\\n\\r\\n123456789 123456789 123456789 123456789 123456789 123456789 123456789 1234\\r\\n 56789 123456789 123456789 \\r\\n'
Notice that there is an extra empty string in the end of the content lines.
That is so they can be easily joined with: '\r\n'.join(contentlines)).
>>> Contentlines.from_string('A short line\\r\\n')
['A short line', '']
>>> Contentlines.from_string('A faked\\r\\n long line\\r\\n')
['A faked long line', '']
>>> Contentlines.from_string('A faked\\r\\n long line\\r\\nAnd another lin\\r\\n\\te that is folded\\r\\n')
['A faked long line', 'And another line that is folded', '']
"""
def __str__(self):
"Simply join self."
return '\r\n'.join(map(str, self))
def from_string(st):
"Parses a string into content lines"
try:
# a fold is carriage return followed by either a space or a tab
unfolded = FOLD.sub('', st)
lines = [Contentline(line) for line in unfolded.splitlines() if line]
lines.append('') # we need a '\r\n' in the end of every content line
return Contentlines(lines)
except:
raise ValueError, 'Expected StringType with content lines'
from_string = staticmethod(from_string)
# ran this:
# sample = open('./samples/test.ics', 'rb').read() # binary file in windows!
# lines = Contentlines.from_string(sample)
# for line in lines[:-1]:
# print line.parts()
# got this:
#('BEGIN', Parameters({}), 'VCALENDAR')
#('METHOD', Parameters({}), 'Request')
#('PRODID', Parameters({}), '-//My product//mxm.dk/')
#('VERSION', Parameters({}), '2.0')
#('BEGIN', Parameters({}), 'VEVENT')
#('DESCRIPTION', Parameters({}), 'This is a very long description that ...')
#('PARTICIPANT', Parameters({'CN': 'Max M'}), 'MAILTO:maxm@mxm.dk')
#('DTEND', Parameters({}), '20050107T160000')
#('DTSTART', Parameters({}), '20050107T120000')
#('SUMMARY', Parameters({}), 'A second event')
#('END', Parameters({}), 'VEVENT')
#('BEGIN', Parameters({}), 'VEVENT')
#('DTEND', Parameters({}), '20050108T235900')
#('DTSTART', Parameters({}), '20050108T230000')
#('SUMMARY', Parameters({}), 'A single event')
#('UID', Parameters({}), '42')
#('END', Parameters({}), 'VEVENT')
#('END', Parameters({}), 'VCALENDAR')