# Python Facile
#
# Source : xhtmllib.py

 
"""Un parseur pour XHTML 1.0 Strict, version 0.3.0"""

# Author: lionel Grolleau

import re
import pprint
from xmllib import XMLParser

# Decimal number: optional sign, a mantissa holding at least one digit
# ('.' or ',' accepted as decimal separator) and an optional exponent.
# Requiring a digit keeps strings such as '', '+', '.', ',' or '.e5', which
# float() rejects, from being reported as numbers.
floatnumber = re.compile(r'^[-+]?(?:\d+(?:[.,]\d*)?|[.,]\d+)([eE][-+]?\d+)?$')
# Integer: optional sign followed by at least one digit (so '', '+' and '-'
# no longer match).
intnumber = re.compile(r'^[-+]?\d+$')

class XHTMLParser(XMLParser):
    """Parser for XHTML 1.0 Strict documents, built on xmllib.XMLParser.

    Character data is accumulated while parsing and handed out ("flushed")
    by the tag handlers that call known_starttag(), known_endtag() or
    known_dotag().  Handlers defined as ``pass`` ignore their tag, so the
    surrounding text keeps accumulating across it.  The content of
    <style>, <script>, <object> and <form> is skipped.

    Flushed data is whitespace-normalised (except inside <pre>) and,
    depending on the _tonumeric / _toboolean switches set in reset(),
    converted to an int, a float or 0/1.

    verbose -- when true, print a trace of the tags and data encountered
    warning -- when true, report unknown tags, entity references and
               character references
    """

    # Start from the HTML entity table, but on a private copy so that the
    # shared htmlentitydefs.entitydefs dictionary is left untouched.
    from htmlentitydefs import entitydefs
    entitydefs = entitydefs.copy()
    # The five predefined XML entities are given as character references,
    # as the "must use charref" notes require.
    # NOTE(review): presumably because xmllib re-reads replacement text, so
    # a literal '<' or '&' would be taken for markup -- confirm against the
    # xmllib documentation.
    entitydefs['lt'] = '&#60;'    # must use charref
    entitydefs['gt'] = '&#62;'
    entitydefs['amp'] = '&#38;'   # must use charref
    entitydefs['quot'] = '&#34;'
    entitydefs['apos'] = '&#39;'

    def __init__(self, verbose=0, warning=0):
        """Store the reporting switches, then initialise the XML parser."""
        self.warning = warning
        self.verbose = verbose
        XMLParser.__init__(self)

    def reset(self):
        """Reset the parser state (extends XMLParser.reset)."""
        # extend (called by XMLParser.__init__)
        self._tonumeric = 1  # turn a 'numeric' string into an int or a float
        self._toboolean = 0  # turn a 'true' or 'false' string into 1 or 0
        self._boolean = {'true': 1, 'false': 0, 'vrai': 1, 'faux': 0}
        #
        self.__XHTMLdata = ""      # internal: character data not flushed yet
        self.__skip2endtag = None  # internal: tag whose content is being skipped
        self.__format = 1          # internal: 0 while inside <pre>
        #
        XMLParser.reset(self)

    def format_data(self):
        """Return the pending character data, formatted, and clear it.

        Inside a <pre> element the data is returned untouched.  Otherwise:
        runs of whitespace are collapsed to single spaces;
        a numeric string becomes an int or a float (if _tonumeric is set);
        a key of _boolean, compared case-insensitively, becomes its value
        (if _toboolean is set).
        """
        data = self.__XHTMLdata
        self.__XHTMLdata = ""
        if not self.__format:
            return data
        data = " ".join(data.split())
        if not data:
            return data
        if self._tonumeric and data[0] in "+-0123456789.,":
            try:
                if intnumber.match(data):
                    return int(data)
                if floatnumber.match(data):
                    return float(data.replace(',', '.'))
            except ValueError:
                # The patterns may accept strings without any digit
                # ('+', '.', ','): keep those as plain text.
                pass
        if self._toboolean:
            key = data.lower()
            if key in self._boolean:
                return self._boolean[key]
        return data

    def flush(self, dataname='DATA'):
        """Return the formatted pending data, or '' when there is none.

        dataname -- label used for the data in the verbose trace
        """
        if not self.__XHTMLdata:
            return ''
        data = self.format_data()
        if data != '' and self.verbose:
            print('%s: [%s]' % (dataname, repr(data)))
        return data

    def _format_attrs(self, attrs):
        """Render attrs as ' name="value"' pairs ('' when there are none)."""
        if not attrs:
            return ""
        # key.split()[-1] keeps only the last word of the key, i.e. the
        # attribute name without any namespace in front of it.
        return "".join([' %s="%s"' % (key.split()[-1], value)
                        for key, value in attrs.items()])

    def known_starttag(self, attrs):
        """Flush the pending data at a start tag and return it."""
        data = self.flush()
        if self.verbose:
            print("start tag: <%s%s>" % (self.stack[-1][0], self._format_attrs(attrs)))
        return data

    def known_endtag(self, dataname='DATA'):
        """Flush the pending data at an end tag and return it."""
        data = self.flush(dataname)
        if self.verbose:
            print("end tag: </%s>" % (self.stack[-1][0],))
        return data

    def known_dotag(self, attrs):
        """Flush the pending data at an empty-element tag and return it."""
        data = self.flush()
        if self.verbose:
            print("tag: <%s%s />" % (self.stack[-1][0], self._format_attrs(attrs)))
        return data

    def skip_starttag(self, attrs):
        """Like known_starttag(), then skip everything up to the end tag."""
        data = self.known_starttag(attrs)
        self.__skip2endtag = self.stack[-1][-1]  # namespaces + 'tag'
        return data

    def skip_endtag(self, dataformat):
        """Discard the data gathered inside a skipped element; return ''."""
        self.__XHTMLdata = ""
        if self.verbose:
            tag = self.stack[-1][0]  # tag alone
            print("data: (%s) Skiped\nend tag: </%s>" % (dataformat, tag))
        return ''

    def close(self):
        """Finish parsing and return the data still pending."""
        XMLParser.close(self)
        return self.flush()

    def handle_starttag(self, tag, method, attrs):
        """Call the start handler unless a skipped element is open."""
        if self.__skip2endtag is None:
            method(attrs)

    def handle_endtag(self, tag, method):
        """Call the end handler unless inside a skipped element.

        The end tag of the skipped element itself ends the skip and is
        handled normally.
        """
        if self.__skip2endtag is None:
            method()
        elif self.__skip2endtag == tag:
            self.__skip2endtag = None
            method()

    def handle_data(self, data):
        """Accumulate character data until the next flush."""
        self.__XHTMLdata = self.__XHTMLdata + data

    def handle_comment(self, data):
        """Flush and return the pending data when a comment is met."""
        if self.__skip2endtag is not None:
            return ''
        # A separate name keeps the comment text (data) available for the trace.
        pending = self.flush()
        if self.verbose:
            r = repr(data)
            if len(r) > 68:
                r = r[:32] + '...' + r[-32:]
            print('comment: %s' % (r,))
        return pending

    def handle_xml(self, encoding, standalone):
        """Flush and return the pending data at the XML declaration."""
        if self.__skip2endtag is not None:
            return ''
        pending = self.flush()
        if self.verbose:
            print('xml: encoding= %s standalone= %s' % (encoding, standalone))
        return pending

    def handle_doctype(self, tag, pubid, syslit, data):
        """Flush and return the pending data at the DOCTYPE declaration."""
        if self.__skip2endtag is not None:
            return ''
        pending = self.flush()
        if self.verbose:
            print('DOCTYPE: %s %s\n  pubid= %s\n  syslit= %s' % (tag, data, pubid, syslit))
        return pending

    def handle_cdata(self, data):
        """Flush and return the pending data when a CDATA section is met.

        The content of the section itself is only shown in the trace.
        """
        if self.__skip2endtag is not None:
            return ''
        pending = self.flush()
        if self.verbose:
            print('cdata: %s' % (data,))
        return pending

    def handle_proc(self, name, data):
        """Flush and return the pending data at a processing instruction."""
        if self.__skip2endtag is not None:
            return ''
        pending = self.flush()
        if self.verbose:
            print('processing: name= %s\n  data= %s' % (name, data))
        return pending

    def syntax_error(self, message):
        """Report a syntax error together with the current line number."""
        print('error at line %d: %s' % (self.lineno, message))

    def unknown_starttag(self, tag, attrs):
        """Dispatch on the tag name alone, or warn about an unknown tag.

        The name taken from the top of the stack (self.stack[-1][0]) is
        looked up in self.elements instead of the tag passed in.
        """
        name = self.stack[-1][0]
        method = self.elements.get(name, (None, None))[0]
        if method is not None:
            self.handle_starttag(tag, method, attrs)
            return
        if self.warning:
            print("*** unknown start tag: <%s%s>" % (name, self._format_attrs(attrs)))

    def unknown_endtag(self, tag):
        """End-tag counterpart of unknown_starttag()."""
        name = self.stack[-1][0]
        method = self.elements.get(name, (None, None))[1]
        if method is not None:
            self.handle_endtag(tag, method)
            return
        if self.warning:
            print("*** unknown end tag: </%s>" % (name,))

    def unknown_entityref(self, ref):
        """Warn about an entity reference that is not in entitydefs."""
        if self.warning:
            print("*** unknown entity ref: &%s;" % (ref,))

    def unknown_charref(self, ref):
        """Warn about an unknown character reference."""
        if self.warning:
            print("*** unknown char ref: &#%s;" % (ref,))

    # Document Structure
    def start_html(self, attrs): self.known_starttag(attrs)
    def end_html(self): self.known_endtag()

    # Document Head
    def start_head(self, attrs): self.known_starttag(attrs)
    def end_head(self): self.known_endtag()
    def start_title(self, attrs): self.known_starttag(attrs)
    def end_title(self): self.known_endtag('Document title')
    def start_base(self, attrs): pass # tag ignored by parser
    def end_base(self): pass # do_base
    def start_meta(self, attrs): pass # tag ignored by parser
    def end_meta(self): pass # do_meta
    def start_link(self, attrs): pass # tag ignored by parser
    def end_link(self): pass # do_link
    # Skip (#PCDATA) between <style> </style>
    def start_style(self, attrs): self.skip_starttag(attrs)
    def end_style(self): self.skip_endtag('#PCDATA')
    # Skip (#PCDATA) between <script> </script>
    def start_script(self, attrs): self.skip_starttag(attrs)
    def end_script(self): self.skip_endtag('#PCDATA')
    def start_noscript(self, attrs): pass # tag ignored by parser
    def end_noscript(self): pass # tag ignored by parser

    # Document Body
    def start_body(self, attrs): self.known_starttag(attrs)
    def end_body(self): self.known_endtag()
    def start_div(self, attrs): self.known_starttag(attrs)
    def end_div(self): self.known_endtag()

    # Paragraphs
    def start_p(self, attrs): self.known_starttag(attrs)
    def end_p(self): self.known_endtag()

    # Headings
    def start_h1(self, attrs): self.known_starttag(attrs)
    def end_h1(self): self.known_endtag('h1')
    def start_h2(self, attrs): self.known_starttag(attrs)
    def end_h2(self): self.known_endtag('h2')
    def start_h3(self, attrs): self.known_starttag(attrs)
    def end_h3(self): self.known_endtag('h3')
    def start_h4(self, attrs): self.known_starttag(attrs)
    def end_h4(self): self.known_endtag('h4')
    def start_h5(self, attrs): self.known_starttag(attrs)
    def end_h5(self): self.known_endtag('h5')
    def start_h6(self, attrs): self.known_starttag(attrs)
    def end_h6(self): self.known_endtag('h6')

    # Lists
    def start_ul(self, attrs): self.known_starttag(attrs)
    def end_ul(self): self.known_endtag()
    def start_ol(self, attrs): self.known_starttag(attrs)
    def end_ol(self): self.known_endtag()
    def start_li(self, attrs): self.known_starttag(attrs)
    def end_li(self): self.known_endtag('li DATA')
    def start_dl(self, attrs): self.known_starttag(attrs)
    def end_dl(self): self.known_endtag()
    def start_dt(self, attrs): self.known_starttag(attrs)
    def end_dt(self): self.known_endtag()
    def start_dd(self, attrs): self.known_starttag(attrs)
    def end_dd(self): self.known_endtag()

    # Address
    def start_address(self, attrs): pass # tag ignored by parser
    def end_address(self): pass # tag ignored by parser

    # Horizontal Rule
    def start_hr(self, attrs): self.known_dotag(attrs)
    def end_hr(self): pass # do_hr

    # Preformatted Text
    def start_pre(self, attrs):
        # Flush what precedes <pre> with formatting still on, then turn it off.
        self.known_starttag(attrs)
        self.__format = 0
    def end_pre(self):
        # Flush the <pre> content unformatted before turning formatting back on.
        self.known_endtag('pre DATA')
        self.__format = 1

    # Blocklike Quotes
    def start_blockquote(self, attrs): pass # tag ignored by parser
    def end_blockquote(self): pass # tag ignored by parser

    # Inserted/Deleted Text
    def start_ins(self, attrs): pass # tag ignored by parser
    def end_ins(self): pass # tag ignored by parser
    def start_del(self, attrs): pass # tag ignored by parser
    def end_del(self): pass # tag ignored by parser

    # Anchor Element
    def start_a(self, attrs): self.known_starttag(attrs)
    def end_a(self): self.known_endtag()

    # Inline Elements
    def start_span(self, attrs): pass # tag ignored by parser
    def end_span(self): pass # tag ignored by parser
    def start_bdo(self, attrs): pass # tag ignored by parser
    def end_bdo(self): pass # tag ignored by parser
    def start_br(self, attrs): pass # tag ignored by parser
    def end_br(self): pass # do_br
    def start_em(self, attrs): pass # tag ignored by parser
    def end_em(self): pass # tag ignored by parser
    def start_strong(self, attrs): pass # tag ignored by parser
    def end_strong(self): pass # tag ignored by parser
    def start_dfn(self, attrs): pass # tag ignored by parser
    def end_dfn(self): pass # tag ignored by parser
    def start_code(self, attrs): pass # tag ignored by parser
    def end_code(self): pass # tag ignored by parser
    def start_samp(self, attrs): pass # tag ignored by parser
    def end_samp(self): pass # tag ignored by parser
    def start_kbd(self, attrs): pass # tag ignored by parser
    def end_kbd(self): pass # tag ignored by parser
    def start_var(self, attrs): pass # tag ignored by parser
    def end_var(self): pass # tag ignored by parser
    def start_cite(self, attrs): pass # tag ignored by parser
    def end_cite(self): pass # tag ignored by parser
    def start_abbr(self, attrs): pass # tag ignored by parser
    def end_abbr(self): pass # tag ignored by parser
    def start_acronym(self, attrs): pass # tag ignored by parser
    def end_acronym(self): pass # tag ignored by parser
    def start_q(self, attrs): pass # tag ignored by parser
    def end_q(self): pass # tag ignored by parser
    def start_sub(self, attrs): pass # tag ignored by parser
    def end_sub(self): pass # tag ignored by parser
    def start_sup(self, attrs): pass # tag ignored by parser
    def end_sup(self): pass # tag ignored by parser
    def start_tt(self, attrs): pass # tag ignored by parser
    def end_tt(self): pass # tag ignored by parser
    def start_i(self, attrs): pass # tag ignored by parser
    def end_i(self): pass # tag ignored by parser
    def start_b(self, attrs): pass # tag ignored by parser
    def end_b(self): pass # tag ignored by parser
    def start_big(self, attrs): pass # tag ignored by parser
    def end_big(self): pass # tag ignored by parser
    def start_small(self, attrs): pass # tag ignored by parser
    def end_small(self): pass # tag ignored by parser

    # Object
    # Skip (object.content) between <object> </object>
    def start_object(self, attrs): self.skip_starttag(attrs)
    def end_object(self): self.skip_endtag('object.content')
    def start_param(self, attrs): pass # tag ignored by parser
    def end_param(self): pass # do_param

    # Images
    def start_img(self, attrs): self.known_dotag(attrs)
    def end_img(self): pass # do_img

    # Client-side image maps
    def start_map(self, attrs): pass # tag ignored by parser
    def end_map(self): pass # tag ignored by parser
    def start_area(self, attrs): pass # tag ignored by parser
    def end_area(self): pass # do_area
    # Former, misspelled names of the <area> handlers, kept for compatibility.
    start_aera = start_area
    end_aera = end_area

    # Forms
    # Skip (form.content) between <form> </form>
    def start_form(self, attrs): self.skip_starttag(attrs)
    def end_form(self): self.skip_endtag('form.content')
    def start_label(self, attrs): pass # tag ignored by parser
    def end_label(self): pass # tag ignored by parser
    def start_input(self, attrs): pass # tag ignored by parser
    def end_input(self): pass # do_input
    def start_select(self, attrs): pass # tag ignored by parser
    def end_select(self): pass # tag ignored by parser
    def start_optgroup(self, attrs): pass # tag ignored by parser
    def end_optgroup(self): pass # tag ignored by parser
    def start_option(self, attrs): pass # tag ignored by parser
    def end_option(self): pass # tag ignored by parser
    def start_textarea(self, attrs): pass # tag ignored by parser
    def end_textarea(self): pass # tag ignored by parser
    def start_fieldset(self, attrs): pass # tag ignored by parser
    def end_fieldset(self): pass # tag ignored by parser
    def start_legend(self, attrs): pass # tag ignored by parser
    def end_legend(self): pass # tag ignored by parser
    def start_button(self, attrs): pass # tag ignored by parser
    def end_button(self): pass # tag ignored by parser

    # Tables
    def start_table(self, attrs): self.known_starttag(attrs)
    def end_table(self): self.known_endtag()
    def start_caption(self, attrs): self.known_starttag(attrs)
    def end_caption(self): self.known_endtag('Table caption')
    def start_thead(self, attrs): pass # tag ignored by parser
    def end_thead(self): pass # tag ignored by parser
    def start_tfoot(self, attrs): pass # tag ignored by parser
    def end_tfoot(self): pass # tag ignored by parser
    def start_tbody(self, attrs): pass # tag ignored by parser
    def end_tbody(self): pass # tag ignored by parser
    def start_colgroup(self, attrs): pass # tag ignored by parser
    def end_colgroup(self): pass # tag ignored by parser
    def start_col(self, attrs): pass # tag ignored by parser
    def end_col(self): pass # do_col
    def start_tr(self, attrs): self.known_starttag(attrs)
    def end_tr(self): self.known_endtag()
    def start_th(self, attrs): self.known_starttag(attrs)
    def end_th(self): self.known_endtag()
    def start_td(self, attrs): self.known_starttag(attrs)
    def end_td(self): self.known_endtag()


class testXHTMLParser(XHTMLParser):
    """Sample XHTMLParser subclass collecting the title, links and images.

    Results are available as attributes:
      title -- data returned by known_endtag() at </title>
      link  -- one dictionary per <a> element, holding the attributes
               named in _linkattrs plus 'link', the data returned by
               known_endtag() at </a>
      image -- one dictionary per <img> element, holding the attributes
               named in _imageattrs
    An attribute that is absent from the tag is stored as ''.
    """

    def reset(self):
        """Clear the collected results, then reset the parser."""
        # extend (called by XMLParser.__init__)
        self.title = ""  # title of the XHTML document
        self.link = []   # list of links
        self.image = []  # list of images

        # names of the attributes to collect
        self._imageattrs = ['src', 'alt', 'title']  # image attributes
        self._linkattrs = ['href', 'title', 'name']  # link attributes
        # attributes of the most recently opened <a> element
        self.__a_attrs = {}
        #
        XHTMLParser.reset(self)

    def _select_attrs(self, attrs, wanted):
        """Return a dict with every name in wanted, '' when not in attrs."""
        selected = {}
        for key in wanted:
            selected[key] = ''
        for key, value in attrs.items():
            key = key.split()[-1]  # strip the namespace
            if key in wanted:
                selected[key] = value
        return selected

    # title
    def end_title(self):
        self.title = self.known_endtag('Document title')

    # Images
    def start_img(self, attrs):
        # update the list of images
        self.known_dotag(attrs)
        self.image.append(self._select_attrs(attrs, self._imageattrs))

    def end_img(self): pass # do_img

    # Anchor Element
    def start_a(self, attrs):
        # remember the attributes until the matching end tag
        self.known_starttag(attrs)
        self.__a_attrs = self._select_attrs(attrs, self._linkattrs)

    def end_a(self):
        # update the list of links
        data = self.known_endtag('LINK')
        self.__a_attrs['link'] = data
        self.link.append(self.__a_attrs)


def main(args=None):
    """Command-line test driver for the testXHTMLParser class.

    Parses one XHTML file and prints its title, its links and its images.

    Options:
        -w  show the warnings
        -v  show the other messages
        -s  silent mode (turns -w and -v off)

    args -- list of command-line arguments, without the program name;
            sys.argv[1:] is used when it is None.  The first positional
            argument names the file to parse: 'test.html' when missing,
            '-' for standard input.

    Exits with status 1 when the file cannot be opened.
    """
    import sys
    import getopt

    verbose = warning = 0

    # Only fall back on the real command line when no list was passed at
    # all; an explicit empty list means "no arguments".
    if args is None:
        args = sys.argv[1:]

    opts, args = getopt.getopt(args, 'wvs')
    for opt, _value in opts:
        if opt == '-w':
            warning = 1
        elif opt == '-v':
            verbose = 1
        elif opt == '-s':
            verbose = 0
            warning = 0

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() is used instead of binding the exception in the
            # except clause, whose syntax differs between Python versions.
            print("%s : %s" % (filename, sys.exc_info()[1]))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            f.close()  # closed even when read() fails

    x = testXHTMLParser(verbose=verbose, warning=warning)
    # The document is fed one character at a time.
    for c in data:
        x.feed(c)
    x.close()
    print("\nTitre du document XHTML: %s" % (x.title,))
    print("\nliste des liens:")
    pprint.pprint(x.link)
    print("\nliste des images:")
    pprint.pprint(x.image)

# Run the test driver only when the file is executed as a script.
if "__main__" == __name__:
    main()