Sender: johnw@aris.dynodns.net Newsgroups: gnu.emacs.sources Subject: xml-parse.el 1.2 From: John Wiegley Date: 20 Feb 2001 16:40:44 -0700 Message-ID: Organization: mail2news@nym.alias.net The version simplifies the core reader function, although I couldn't eke much more speed out. It can parse a 4000 line XML file in under one second, but the profiler won't give me sufficient information about core Lisp directives to know where the slowness is. Anyway, this version also supports non-nested . Have fun, John ---------------------------------------------------------------------- ;;; xml-parse --- code to efficiently read/write XML data with Elisp ;; Copyright (C) 2001 John Wiegley. ;; Author: John Wiegley ;; Version: 1.2 ;; Created: Feb 15, 2001 ;; Keywords: convenience languages lisp xml parse data ;; This file is NOT (yet) part of GNU Emacs. ;; This is free software; you can redistribute it and/or modify it ;; under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2, or (at your option) ;; any later version. ;; This software is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;; General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to the ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;; Boston, MA 02111-1307, USA. ;;; Commentary: ;; ;; XML is yet another way of expressing recursive, attributed data ;; structures -- something which Lisp has had the capacity to do for ;; decades. ;; ;; The approach taken by xml-parse.el is to read XML data into Lisp ;; structures, and allow those same Lisp structures to be written out ;; as XML. It should facilitate the manipulation and use of XML by ;; Elisp programs. ;; NOTE: This is not a validating parser, and makes no attempt to read ;; DTDs. See psgml.el if you need that kind of power. Also, it ;; cannot handle tags that begin with ;; ;; ;; My own book! ;; First ;; ;; ;; John ;; Wiegley ;; ;; ;; ;; ;; ;; A very small chapter ;; Wonder where the content is... ;; ;; ;; ;; It would be parsed into this Lisp structure: ;; ;; '(("book" ("id" . "compiler")) ;; ("bookinfo" ;; ("bookbiblio" ;; ("title" "My own book!") ;; ("edition" "FIrst") ;; ("authorgroup" ;; ("author" ;; ("firstname" "John") ;; ("surname" "Wiegley"))))) ;; ("chapter" ;; ("title" "A very small chapter") ;; ("para" "Wonder where the content is..."))) ;; ;; Now it can easily be modified and interpreted using ordinary Lisp ;; code, without the ordeal of manipulating textual XML. When you're ;; done modifying it, you can write it back out (complete with proper ;; indentation and newlines) using: ;; ;; (insert-xml t) ;; ;; See the documentation for `read-xml' and `insert-xml' for more ;; information. ;; ;; There are also a set of helper functions for accessing parts of a ;; parsed tag: ;; ;; xml-tag-name get the name of a tag ;; xml-tag-attrlist returns a tag's attribute alist ;; xml-tag-attr lookup a specific tag attribute ;; xml-tag-children returns a tag's child list ;; xml-tag-child lookup a specific child tag by name ;; ;; Also, the attribute list and child lists can be searched using ;; `assoc', since they roughly have the same format as an alist. ;;;###autoload (defun read-xml () "Parse XML data at point into a Lisp structure. See `insert-xml' for a description of the format of this structure. Point is left at the end of the XML structure read." (cdr (xml-parse-read))) (defsubst xml-tag-name (tag) "Return the name of an xml-parse'd XML TAG." (if (stringp (car tag)) (car tag) (caar tag))) (defsubst xml-tag-attrlist (tag) "Return the attribute list of an xml-parse'd XML TAG." (and (not (stringp (car tag))) (cdar tag))) (defsubst xml-tag-attr (tag attr) "Return a specific ATTR of an xml-parse'd XML TAG." (cdr (assoc attr (xml-tag-attrlist tag)))) (defsubst xml-tag-children (tag) "Return the list of child tags of an xml-parse'd XML TAG." (cdr tag)) (defun xml-tag-child (tag name) "Return the first child matching NAME, of an xml-parse'd XML TAG." (catch 'found (let ((children (xml-tag-children tag))) (while children (if (string= name (xml-tag-name (car children))) (throw 'found (car children))) (setq children (cdr children)))))) ;;;###autoload (defun insert-xml (data &optional add-newlines public system depth) "Insert DATA, a recursive Lisp structure, at point as XML. DATA has the form: ENTRY ::= (TAG CHILD*) CHILD ::= STRING | ENTRY TAG ::= TAG_NAME | (TAG_NAME ATTR+) ATTR ::= (ATTR_NAME . ATTR_VALUE) TAG_NAME ::= STRING ATTR_NAME ::= STRING ATTR_VALUE ::= STRING If ADD-NEWLINES is non-nil, newlines and indentation will be added to make the data user-friendly. If PUBLIC and SYSTEM are non-nil, a !DOCTYPE tag will be added at the top of the document to identify it as an XML document. DEPTH is normally for internal use only, and controls the depth of the indentation." (when (and (not depth) public system) (insert "\n") (insert "\n")) (if (stringp data) (insert data) (let ((node (car data)) add-nl) (and depth (bolp) (insert (make-string (* depth 2) ? ))) (if (stringp node) (insert "<" node) (setq node (caar data)) (insert "<" node) (let ((attrs (cdar data))) (while attrs (insert " " (caar attrs) "=\"" (cdar attrs) "\"") (setq attrs (cdr attrs))))) (if (null (cdr data)) (insert "/>") (setq data (cdr data)) (while data (and add-newlines (not (stringp (car data))) (insert ?\n)) (setq add-nl (insert-xml (car data) add-newlines nil nil (1+ (or depth 0))) data (cdr data))) (when add-nl (and add-newlines (insert ?\n)) (and depth (insert (make-string (* depth 2) ? )))) (insert "")) t))) ;;; Internal Functions (defun xml-parse-profile () (interactive) (let ((elp-function-list '(buffer-substring-no-properties char-after char-before forward-char looking-at match-string-no-properties match-beginning match-end point re-search-forward read-xml xml-parse-read search-forward string= stringp substring xml-parse-concat))) (elp-instrument-list))) (defsubst xml-parse-concat (beg end lst) "Add the string from BEG to END to LST, ignoring pure whitespace." (let ((text (buffer-substring-no-properties beg end))) (while (string-match "" text) (setq text (concat (substring text 0 beg) (substring text (match-end 0)))))) (let ((i 0) (l (length text)) non-ws) (while (< i l) (unless (memq (aref text i) '(?\n ?\t ? )) (setq i l non-ws t)) (setq i (1+ i))) (if non-ws (setcdr lst (list text)))))) (defun xml-parse-read (&optional inner-p) (let ((beg (search-forward "<" nil t)) after) (while (and beg (eq (setq after (char-after)) ?!) (looking-at "!--")) (search-forward "-->") (setq beg (search-forward "<" nil t))) (when beg (unless (re-search-forward "[ >]" nil t) (error "Pos %d: Unclosed tag" (point))) (cons (1- beg) (if (memq after '(?/ ?? ?!)) (if (eq after ?/) (buffer-substring-no-properties (1+ beg) (1- (point))) (error "Pos %d: Tags beginning with ? or ! are not supported" beg)) (let* ((single (eq (char-before (1- (point))) ?/)) (tag (buffer-substring-no-properties beg (- (point) (if single 2 1)))) attrs data-beg data) ;; handle the attribute list, if present (if (eq (char-before) ? ) (let* ((attrs (list t)) (lastattr attrs)) (while (not (memq (char-after) '(?/ ?>))) (unless (re-search-forward "\\([^=]+\\)=\"\\([^\"]+\\)\"\\s-*" nil t) (error "Pos %d: Invalid attribute list" (point))) (let ((attr (cons (match-string-no-properties 1) (match-string-no-properties 2)))) (setcdr lastattr (list attr)) (setq lastattr (cdr lastattr)))) (setq tag (cons tag (cdr attrs))) (forward-char 1))) ;; return the tag and its data (if single (list tag) (setq tag (list tag)) (let ((data-beg (point)) (tag-end (last tag))) (while (and (setq data (xml-parse-read t)) (not (stringp (cdr data)))) (if (xml-parse-concat data-beg (car data) tag-end) (setq tag-end (cdr tag-end))) (setq data-beg (point)) (setcdr tag-end (list (cdr data))) (setq tag-end (cdr tag-end))) (xml-parse-concat data-beg (car data) tag-end) tag)))))))) (provide 'xml-parse) ;;; xml-parse.el ends here