# -*- coding: UTF-8 -*-
""" Copyright (C) 2003 Peter Ohler

    XMLite is free software; you can redistribute it and/or modify it under
    the terms of the GNU General Public License as published by the Free
    Software Foundation; either version 2, or (at your option) any later
    version.

    XMLite is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You may download a copy of the GNU General Public License at
    http://www.gnu.org/licenses/gpl.txt or obtain a copy of the GNU General
    Public License by writing to the Free Software Foundation, Inc., 59 Temple
    Place - Suite 330, Boston, MA 02111-1307, USA.

    XMLite - extremely light weight XML parser and printer

    The xmlite module is an extremely light weight XML parser and printer. It
    does not use the DOM or SAX interfaces but instead works with a simple
    list or rather nested lists to represent an XML document. The parser takes
    as input a string or filename and returns a list with all the elements of
    the XML file.

    The first item in the top level XML list is a dict object with 'version',
    'encoding', and 'standalone' keys. If there are any decl tags such as
    'DOCTYPE' they will be next in the list and will be tuples with the decl
    tag name and the value of the tag as the second item of the tuple.

    Comments are included as lists of two items. The first item is None and
    the second is a string which is the comment text.

    CDATA are tuples of two items. The first item is 'CDATA' and the second is
    the CDATA content.

    XML elements are lists. The first item in the list is the element tag or
    name. The second item is a dict object which includes all the attributes
    of the element (or None when the element has no attributes). Any
    remaining list items are either comments, strings, CDATA, or more
    elements as lists.

    Author: Peter Ohler, peter@ohler.com
    $Id: xmlite.py,v 1.1.1.1 2004/05/12 09:26:11 sam Exp $
"""

import os
import sys
import string

class XmlException(Exception):
    """ XML Exception for reporting errors in parsing of an XML file or
        string.

        Attributes:
          msg  - the error message
          line - 1-based line number of the error, or -1 when no source
                 string was supplied
          char - 1-based column of the error within its line, or -1 when no
                 source string was supplied
    """

    def __init__(self, msg, s, pos):
        """ Pass in the error message, string being parsed, and the position
            in the string where the error was detected. Pass None as the
            string when there is no position to report.
        """
        # Deriving from Exception keeps the class raisable on every Python
        # version and lets generic 'except Exception' handlers see it.
        Exception.__init__(self, msg)
        self.msg = msg
        if s is None:
            self.line = -1
            self.char = -1
        else:
            # Count '\n' rather than os.linesep: text read in text mode has
            # its line endings normalised to '\n' on every platform.
            self.line = 1 + s.count('\n', 0, pos)
            # rfind() yields -1 when there is no earlier newline, so the
            # column comes out 1-based on the first line as on all others.
            self.char = pos - s.rfind('\n', 0, pos)

    def __str__(self):
        """ Return the message, followed by the position when it is known.
        """
        if self.line < 0:
            return self.msg
        return "%s at %d of line %d" % (self.msg, self.char, self.line)
    
def printXml(xml, indent = 0):
    """ Print out a list that matches the expected XML list format. Other
        formats may not print out correctly. The output format is XML.
    """
    istr = ' ' * indent
    if isinstance(xml, str):
        print "%s%s" % (istr, expandCodedChars(xml))
    elif isinstance(xml, tuple):
        if 'CDATA' == xml[0]:
            print "%s<![CDATA[%s]]>" % (istr, xml[1])
        else:
            print "%s<!%s %s>" % (istr, xml[0], xml[1])
    elif isinstance(xml, list):
        tag = xml[0]
        if tag == None:
            print "%s<!-- %s -->" % (istr, xml[1])
            return
        elif isinstance(tag, dict):    # the very top of the xml
            print "<?xml",
            for k in tag:
                v = tag[k]
                if v != None:
                    print '%s="%s"' % (k, v),
            print "?>"
            indent += 2
            for e in xml[1:]:
                printXml(e, indent)
            return
        n = len(xml)
        if n == 1:
            print "%s<%s/>" % (istr, tag)
        elif n == 2:
            attrs = xml[1]
            if attrs == None:
                print "%s<%s/>" % (istr, tag)
            else:
                print "%s<%s" % (istr, tag)
                printAttrs(xml[1], indent + 3)
                print "/>"
        else:
            attrs = xml[1]
            if attrs == None:
                print "%s<%s>" % (istr, tag)
            else:
                print "%s<%s" % (istr, tag)
                printAttrs(xml[1], indent + 3)
                print ">"
            indent += 2
            for e in xml[2:]:
                printXml(e, indent)
            print "%s</%s>" % (istr, tag)
    else:
        raise XmlException("Invalid format", None, 0)

def printAttrs(attrs, indent):
    if not isinstance(attrs, dict):
        if attrs == None:
            return
        raise XmlException("Invalid format", s, i)
    istr = ' ' * indent
    n = len(attrs)
    for a in attrs:
        n -= 1
        # Strings are expanded and special characters are replaces with
        # character codes.
        if 0 < n:
            print '%s%s="%s"' % (istr, a, expandCodedChars(attrs[a]))
        else:
            print '%s%s="%s"' % (istr, a, expandCodedChars(attrs[a])),
        
def toStr(xml, s = "", indent = 0):
    """ Return a string that is an XML document.

        Accepts the same node shapes as printXml and produces the same
        layout: text strings (escaped with expandCodedChars), CDATA and decl
        tuples, comments, elements, and the top level document list whose
        first item is the prolog dict.

        xml    - the node to convert
        s      - string built so far; the node's text is appended to it
        indent - number of spaces to put in front of each line

        Returns s with the XML text of the node appended.
        Raises XmlException if xml is not a str, tuple, or list.
    """
    istr = ' ' * indent
    if isinstance(xml, str):
        # Escape markup characters so the text can be parsed back.
        return s + "%s%s\n" % (istr, expandCodedChars(xml))

    if isinstance(xml, tuple):
        if 'CDATA' == xml[0]:
            return s + "%s<![CDATA[%s]]>\n" % (istr, xml[1])
        return s + "%s<!%s %s>\n" % (istr, xml[0], xml[1])

    if not isinstance(xml, list):
        # There is no source string or position while printing, so report
        # the error without a location.
        raise XmlException("Invalid format", None, 0)

    tag = xml[0]
    if tag is None:                # comment
        return s + "%s<!-- %s -->\n" % (istr, xml[1])

    if isinstance(tag, dict):      # the very top of the xml
        # An XML declaration must list version, encoding and standalone in
        # that order; any other keys follow.
        keys = [k for k in ('version', 'encoding', 'standalone') if k in tag]
        keys += [k for k in tag if k not in keys]
        s = s + "<?xml"
        for k in keys:
            if tag[k] is not None:
                s = s + ' %s="%s"' % (k, tag[k])
        s = s + " ?>\n"
        for e in xml[1:]:
            s = toStr(e, s, indent + 2)
        return s

    # A regular element: [tag, attrs-or-None, child, child, ...]
    n = len(xml)
    if n == 1:
        s = s + "%s<%s/>\n" % (istr, tag)
    elif n == 2:
        attrs = xml[1]
        if attrs is None:
            s = s + "%s<%s/>\n" % (istr, tag)
        else:
            s = s + "%s<%s\n" % (istr, tag)
            s = attrsToStr(attrs, s, indent + 3)
            s = s + "/>\n"
    else:
        attrs = xml[1]
        if attrs is None:
            s = s + "%s<%s>\n" % (istr, tag)
        else:
            s = s + "%s<%s\n" % (istr, tag)
            s = attrsToStr(attrs, s, indent + 3)
            s = s + ">\n"
        for e in xml[2:]:
            s = toStr(e, s, indent + 2)
        s = s + "%s</%s>\n" % (istr, tag)
    return s

def attrsToStr(attrs, s, indent):
    """ Append the attributes of an element to s, one per line, each
        indented by indent spaces. No newline follows the last attribute so
        the caller can close the tag on the same line.

        attrs  - dict of attribute names to values, or None for no attributes
        s      - string built so far
        indent - number of spaces to put in front of each attribute

        Returns s with the attribute text appended.
        Raises XmlException if attrs is neither a dict nor None.
    """
    if attrs is None:
        return s
    if not isinstance(attrs, dict):
        # There is no source string or position while printing, so report
        # the error without a location.
        raise XmlException("Invalid format", None, 0)
    istr = ' ' * indent
    # Strings are expanded and special characters are replaced with
    # character codes.
    lines = ['%s%s="%s"' % (istr, a, expandCodedChars(attrs[a])) for a in attrs]
    return s + "\n".join(lines)

def load(filename):
    """ Load complete file into memory and then parse the string.

        filename - path of the XML file to read

        Returns the XML list produced by parse(). Errors from open() and
        read() propagate to the caller.
    """
    f = open(filename, "r")
    # open() raises on failure rather than returning None, so the only
    # cleanup needed is closing the file even when read() fails.
    try:
        s = f.read()
    finally:
        f.close()

    return parse(s)

def parse(s):
    """ Make one pass and parse directly into an XML list.

        s - the XML text to parse

        Returns a list holding, in document order, the prolog dict (when a
        prolog is present), decl tuples, comment lists, and element lists.

        Raises XmlException on malformed input. An IndexError is raised if
        the input ends in the middle of a construct.
    """
    # 0 - before prolog, 1 - after prolog, comment or decl, 2 - after root
    phase = 0
    x = []
    i = 0
    end = len(s)

    while 1:
        # skip whitespace between top level items; stop at end of input
        while i < end and s[i] in string.whitespace:
            i += 1
        if i >= end:
            break
        # every element at the top level starts with '<'
        if '<' != s[i]:
            raise XmlException("Expected a '<' character", s, i)
        i += 1
        c = s[i]
        if c == '?':    # prolog
            if phase != 0:
                raise XmlException("Prolog must be the first element", s, i)
            i += 1
            i = readProlog(s, i, x)
            phase = 1
        elif c == '!':  # comment or decl
            i += 1
            if '--' == s[i:i + 2]:
                i = readComment(s, i + 2, x)
            elif phase > 1:
                raise XmlException("DECLs must appear before other element", s, i)
            else:
                i = readDecl(s, i, x)
            # Only move forward: a comment after the root element must not
            # reopen the window in which DECLs are accepted.
            if phase < 1:
                phase = 1
        else:           # element
            i = readElement(s, i, x)
            phase = 2
    return x

def readProlog(s, i, x):
    """ Read the XML prolog starting just after the '<?' and append a dict
        with 'version', 'encoding', and 'standalone' keys to x. Keys that
        are not present in the prolog are left as None.

        Returns the index just past the closing '?>'.
        Raises XmlException if the prolog is malformed.
    """
    if s[i:i + 3] != 'xml':
        raise XmlException("Expected 'xml' in prolog", s, i)
    i += 3

    values = { 'version': None, 'encoding': None, 'standalone': None }
    while s[i] != '?':
        token, i = readNameToken(s, i)

        while s[i] in string.whitespace:
            i += 1
        marker = s[i]
        if marker == '?':
            break
        if marker != '=':
            raise XmlException("Expected '=' or '?' in prolog", s, i)
        i += 1
        # only the three known prolog attributes are accepted
        if token not in values:
            raise XmlException("Invalid prolog attribute: '" + token + "'", s, i)
        values[token], i = readQuotedValue(s, i)

    i += 1    # past ?
    if s[i] != '>':
        raise XmlException("Expected '>' after '?' in prolog", s, i)
    x.append(values)

    return i + 1

# Characters that end a name token in readNameToken(): space, tab, newline,
# carriage return, vertical tab, form feed, and the markup characters
# '?', '=', '/', '>' and '<'.
nonNameStr = " \t\n\r?=/><\x0b\x0c"

def readNameToken(s, i):
    """ Skip leading whitespace and read a name token from s starting at i.
        The token ends at the first character found in nonNameStr.

        Returns a (token, index) pair where index is the position of the
        character that ended the token. token is None when no name
        characters were found.
    """
    while s[i] in string.whitespace:
        i += 1
    begin = i
    while s[i] not in nonNameStr:
        i += 1
    # an empty slice means nothing was read; report that as None
    token = s[begin:i]
    return (token or None), i

def readQuotedValue(s, i):
    """ Skip leading whitespace and read a quoted value from s starting at
        i. The value may be enclosed in either double or single quotes, as
        XML allows both; it ends at the next occurrence of the quote
        character that opened it.

        Returns a (value, index) pair where index is just past the closing
        quote. Coded characters in the value are replaced with the
        characters they stand for. value is None when the quotes are empty.

        Raises XmlException if the first non-whitespace character is not a
        quote. An IndexError is raised if the closing quote is missing.
    """
    while s[i] in string.whitespace:
        i += 1
    quote = s[i]
    if quote != '"' and quote != "'":
        raise XmlException("Expected '\"' or \"'\" character", s, i)
    i += 1
    start = i
    while quote != s[i]:
        i += 1
    if start == i:
        return None, i + 1
    return replaceCodedChars(s[start:i]), i + 1
    
def readComment(s, i, x):
    """ Read a comment whose text starts at i and append it to x as a two
        item list: None followed by the comment text with surrounding
        whitespace stripped.

        Returns the index just past the closing '-->'.
        Raises XmlException if no '-->' is found.
    """
    closer = '-->'
    close = s.find(closer, i)
    if close < 0:
        raise XmlException("Comment not terminated", s, i)
    text = s[i:close].strip()
    x.append([None, text])

    return close + len(closer)

def readDecl(s, i, x):
    """ Read a decl tag such as DOCTYPE starting at its name and append a
        (name, value) tuple to x. The value is the text between the name
        and the '>' that closes the decl; nested '<' ... '>' pairs inside
        the value are kept as part of it.

        Returns the index just past the closing '>'.
    """
    name, i = readNameToken(s, i)
    while s[i] in string.whitespace:
        i += 1
    begin = i
    # the decl itself counts as one open bracket
    nesting = 1
    while nesting > 0:
        ch = s[i]
        if ch == '<':
            nesting += 1
        elif ch == '>':
            nesting -= 1
        # stay on the final '>' so the value slice stops before it
        if nesting > 0:
            i += 1
    x.append((name, s[begin:i]))

    return i + 1

def readElement(s, i, x):
    """ Read an element starting at its tag name (just after the '<') and
        append it to x as a list: the tag name, a dict of attributes (or
        None when there are none), then any children. Children are text
        strings, comment lists, CDATA tuples, and nested element lists.

        Returns the index just past the end of the element.
        Raises XmlException if the element is malformed or if the end tag
        name does not match the start tag name.
    """
    name, i = readNameToken(s, i)
    element = [name, None]

    while s[i] in string.whitespace:
        i += 1
    if '/' == s[i]:
        i += 1
        if '>' != s[i]:
            raise XmlException("Expected '>' after '/'", s, i)
        # empty element, no attributes and no children
        x.append(element)
        return i + 1

    # read attribute names until the close (/ or >) is reached
    attrs = None
    while 1:
        attrName, i = readNameToken(s, i)
        while s[i] in string.whitespace:
            i += 1
        c = s[i]
        i += 1
        if '=' == c:
            # readQuotedValue skips any whitespace before the quote
            value, i = readQuotedValue(s, i)
            if attrs is None:
                # the dict is only created once an attribute is found
                attrs = {}
                element[1] = attrs
            attrs[attrName] = value
        elif '/' == c:
            if '>' != s[i]:
                raise XmlException("Expected '>' after '/'", s, i)
            # no children
            x.append(element)
            return i + 1
        elif '>' == c:
            break
        else:
            # i has already moved past the offending character
            raise XmlException("Format error", s, i - 1)

    # read children
    while 1:
        while s[i] in string.whitespace:
            i += 1
        if '<' != s[i]:
            i = readText(s, i, element)
            continue
        i += 1
        c = s[i]
        if '!' == c:    # better be a comment or CDATA
            i += 1
            if '--' == s[i:i + 2]:
                i = readComment(s, i + 2, element)
            elif '[CDATA[' == s[i:i + 7]:
                # step over the whole '[CDATA[' marker so that only the
                # content is stored
                i = readCData(s, i + 7, element)
            else:
                raise XmlException("Comment format error", s, i)
        elif '/' == c:  # end of element
            i += 1
            endName, i = readNameToken(s, i)
            while s[i] in string.whitespace:
                i += 1
            if '>' != s[i]:
                raise XmlException("Expected '>' to close element end tag", s, i)
            if endName != element[0]:
                raise XmlException("Element end tag name mismatch", s, i)
            i += 1
            break
        else:           # read sub element
            i = readElement(s, i, element)

    x.append(element)
    return i

def readCData(s, i, x):
    """ Read CDATA content from s starting at i up to the closing ']]>' and
        append it to x as a ('CDATA', content) tuple.

        Returns the index just past the closing ']]>'.
        Raises XmlException if no ']]>' is found.
    """
    close = s.find(']]>', i)
    if close < 0:
        raise XmlException("No CDATA closure", s, i)
    content = s[i:close]
    x.append(('CDATA', content))

    return close + 3

def readText(s, i, x):
    """ Read text from s starting at i up to the next '<' and append it to
        x. Surrounding whitespace is stripped and coded characters are
        replaced with the characters they stand for.

        Returns the index of the '<' that ended the text.
        Raises XmlException if no '<' is found.
    """
    nextTag = s.find('<', i)
    if nextTag < 0:
        raise XmlException("No text closure", s, i)
    raw = s[i:nextTag]
    x.append(replaceCodedChars(raw.strip()))

    return nextTag

def replaceCodedChars(text):
    """ Return text with every coded character (a sequence starting with
        '&') replaced by the character readCodedChar decodes it to. Text
        without any '&' is returned unchanged.
    """
    if '&' not in text:
        return text

    pieces = []
    cursor = 0      # start of the text not yet copied
    amp = text.find('&')
    while amp >= 0:
        decoded, after = readCodedChar(text, amp)
        pieces.append(text[cursor:amp])
        pieces.append(decoded)
        cursor = after
        amp = text.find('&', cursor)
    # whatever follows the last coded character
    pieces.append(text[cursor:])
    return "".join(pieces)
    
def readCodedChar(s, i):
    """ Decode the coded character in s that starts with the '&' at i.

        Supported forms are the named codes nbsp, lt, gt, amp, quot, and
        apos, decimal codes such as '&#65;', and hexadecimal codes such as
        '&#x41;'. nbsp is decoded to a plain space.

        Returns a (character, index) pair where index is just past the ';'.
        Raises XmlException if the ';' is missing or the name is not known.
        A ValueError is raised if a numeric code is not a valid number or
        character code.
    """
    # The longest reference to a valid character, '&#x10FFFF;' or
    # '&#1114111;', is ten characters including the '&' and the ';'.
    end = s.find(';', i, i + 10)
    if 0 > end:
        raise XmlException("Invalid coded character. Not terminated by ';'", None, -1)
    code = s[i + 1:end]
    if code.startswith('#'):
        if 'x' == code[1:2]:
            value = int(code[2:], 16)
        else:
            value = int(code[1:])
        try:
            c = chr(value)
        except ValueError:
            # On Python 2 chr() stops at 255; larger codes need unichr().
            if sys.version_info[0] >= 3:
                raise
            c = unichr(value)
    else:
        named = {
            'nbsp': ' ',
            'lt': '<',
            'gt': '>',
            'amp': '&',
            'quot': '"',
            'apos': "'",
        }
        if code not in named:
            raise XmlException("Invalid coded character '%s'" % code, None, -1)
        c = named[code]

    return c, end + 1

def expandCodedChars(s):
    """ Return s with the characters that have a special meaning in XML
        replaced by coded characters: '&', '<', '>', '"', and "'" become
        '&amp;', '&lt;', '&gt;', '&quot;', and '&apos;'. Runs of spaces are
        rewritten with '&nbsp;' so they are not collapsed.

        s may be None, which is what the parser stores for an empty
        attribute value; an empty string is returned in that case.
    """
    if s is None:
        return ''

    # '&' has to go first, otherwise the '&' of the codes added below
    # would be escaped a second time.
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    s = s.replace('"', '&quot;')
    s = s.replace("'", '&apos;')

    # todo handle nbsp correctly, needs read adjustment for strip
    # Done after the escaping above so the '&' of '&nbsp;' is left alone.
    if '  ' in s:
        s = s.replace('  ', ' &nbsp;')
        s = s.replace('&nbsp; ', '&nbsp;&nbsp;')

    return s
