XML SAX Parsing — SAX Events, Handlers, Streaming, Memory Efficiency, Python/Java Examples
SAX (Simple API for XML) is an event-driven streaming parser that processes XML without loading the entire document into memory. This makes it ideal for large XML files where DOM would consume too much memory. This guide covers SAX architecture, handlers, and practical implementations.
What You’ll Learn
You’ll understand SAX’s event-driven model (startElement, characters, endElement), implement ContentHandler callbacks in Java and Python, stream-process large XML files (100MB+), and choose between SAX, DOM, and StAX for different scenarios.
Learning Path
flowchart LR
A[DOM Parsing] --> B[SAX Parsing<br/>You are here]
B --> C[StAX Parsing]
C --> D[XPath with Namespaces]
style B fill:#f90,color:#fff
SAX Event Model
SAX reports events as the parser reads the XML document:
| SAX Event | When Fired | Example |
|---|---|---|
| startDocument | Start of XML document | |
| startElement | Opening tag | <book category="fiction"> |
| characters | Text content | “The DOM Guide” |
| endElement | Closing tag | </book> |
| endDocument | End of XML document | |
| warning/error | Parse issues | Malformed XML |
Java SAX Parsing
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
public class SaxBookHandler extends DefaultHandler {
private StringBuilder currentValue = new StringBuilder();
private int bookCount = 0;
private String currentCategory;
private String currentTitle;
private String currentAuthor;
private double currentPrice;
private String currentElement;
/** Called once before any element event; prints a start banner to stdout. */
@Override
public void startDocument() {
    System.out.println("Starting XML parsing...");
}
/**
 * Handles an opening tag: clears the text buffer, remembers the tag name and,
 * for a {@code <book>}, captures its {@code category} attribute and resets
 * the per-book fields.
 *
 * @param uri        namespace URI (unused)
 * @param localName  local name (unused)
 * @param qName      qualified tag name used for matching
 * @param attributes attributes attached to the tag
 */
@Override
public void startElement(String uri, String localName,
        String qName, Attributes attributes) {
    currentElement = qName;
    currentValue.setLength(0);

    boolean opensBook = qName.equals("book");
    if (!opensBook) {
        return;
    }

    // A new record begins: read its category, then wipe the previous book.
    currentCategory = attributes.getValue("category");
    currentPrice = 0;
    currentAuthor = null;
    currentTitle = null;
}
/**
 * Appends a chunk of character data to the buffer. The buffer is appended to
 * rather than overwritten because a parser may deliver one text node in
 * several calls.
 */
@Override
public void characters(char[] ch, int start, int length) {
    currentValue.append(ch, start, length);
}
/**
 * Handles a closing tag: stores the trimmed buffered text into the matching
 * book field, and prints a summary line when a {@code <book>} closes.
 *
 * @param uri       namespace URI (unused)
 * @param localName local name (unused)
 * @param qName     qualified tag name used for matching
 * @throws NumberFormatException if a non-empty price is not a valid number
 */
@Override
public void endElement(String uri, String localName, String qName) {
    // Trim once; every branch below works on the same cleaned text.
    String text = currentValue.toString().trim();
    switch (qName) {
        case "title":
            currentTitle = text;
            break;
        case "author":
            currentAuthor = text;
            break;
        case "price":
            // An empty <price/> keeps the 0 set in startElement instead of
            // aborting the whole parse with a NumberFormatException.
            if (!text.isEmpty()) {
                currentPrice = Double.parseDouble(text);
            }
            break;
        case "book":
            bookCount++;
            System.out.printf("Book %d: [%s] %s by %s — $%.2f%n",
                    bookCount, currentCategory, currentTitle,
                    currentAuthor, currentPrice);
            break;
        default:
            // Other elements carry no data this handler records.
            break;
    }
}
/** Called once after the last event; prints how many books were seen. */
@Override
public void endDocument() {
    System.out.println("Parsing complete. Found " + bookCount + " books.");
}
/**
 * Parses {@code books.xml} with a fresh {@link SaxBookHandler}.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the parser cannot be created or parsing fails
 */
public static void main(String[] args) throws Exception {
    SaxBookHandler bookHandler = new SaxBookHandler();
    SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
    saxParser.parse("books.xml", bookHandler);
}
}
Expected output:
Starting XML parsing...
Book 1: [fiction] The DOM Guide by Alice Smith — $29.99
Book 2: [non-fiction] XML for Beginners by Bob Jones — $39.99
Parsing complete. Found 2 books.
Python SAX Parsing
import xml.sax
class BookHandler(xml.sax.ContentHandler):
    """Collect ``<book>`` records from a SAX event stream.

    Every completed book is appended to ``self.books`` as a dict with the
    keys ``category``, ``title``, ``author`` and ``price``, and is printed
    as it closes.
    """

    def __init__(self):
        super().__init__()
        self.current_value = ''
        self.current_element = ''
        self.books = []
        self.current_book = {}
        # Text chunks of the element being read; joined once in endElement.
        self._text_parts = []

    def startElement(self, name, attrs):
        """Reset the text buffer and, for ``<book>``, start a new record."""
        self._text_parts = []
        self.current_value = ''
        self.current_element = name
        if name == 'book':
            self.current_book = {
                'category': attrs.get('category', ''),
                'title': '',
                'author': '',
                'price': 0.0,
            }

    def characters(self, content):
        """Buffer one chunk of text; a text node may arrive in pieces."""
        self._text_parts.append(content)

    def endElement(self, name):
        """Store the buffered text in the current book, or finish the book.

        Raises:
            ValueError: if a non-empty ``<price>`` is not a valid number.
        """
        self.current_value = ''.join(self._text_parts)
        value = self.current_value.strip()
        if name == 'title':
            self.current_book['title'] = value
        elif name == 'author':
            self.current_book['author'] = value
        elif name == 'price':
            # An empty <price/> keeps the 0.0 default instead of crashing
            # the parse on float('').
            if value:
                self.current_book['price'] = float(value)
        elif name == 'book':
            self.books.append(self.current_book)
            self.print_book(self.current_book)

    def print_book(self, book):
        """Print a one-line summary of *book*."""
        print(f"Book: [{book['category']}] {book['title']} "
              f"by {book['author']} — ${book['price']:.2f}")

    def endDocument(self):
        """Print the total number of books collected."""
        print(f"Parsing complete. Found {len(self.books)} books.")
# Usage: create the handler, then attach it to a fresh SAX parser
handler = BookHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse('books.xml')
Streaming Large XML Files
Java — Processing 1GB XML
public class LargeXmlProcessor {
private static final long MAX_RECORDS = 1000000;
/**
 * Streams {@code filename} through a SAX parser using a 1 MiB read buffer.
 *
 * @param filename path of the XML file to process
 * @throws Exception if the parser cannot be created, the file cannot be
 *                   opened, or parsing fails
 */
public void processLargeFile(String filename) throws Exception {
    final int bufferSize = 1024 * 1024;
    SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();

    // try-with-resources closes the stream even when parsing throws.
    try (InputStream buffered =
            new BufferedInputStream(new FileInputStream(filename), bufferSize)) {
        saxParser.parse(buffered, new LargeFileHandler());
    }
}
/**
 * Counts {@code <transaction>} records in a stream, processing at most
 * {@code MAX_RECORDS} of them and merely counting the rest as skipped.
 */
class LargeFileHandler extends DefaultHandler {
    // Records whose closing tag has been processed.
    private long recordCount = 0;
    // Records encountered after the processing cap was reached.
    private long skippedRecords = 0;
    // Text of the element currently being read.
    private final StringBuilder currentValue = new StringBuilder();
    // True while inside a <transaction> that is being processed.
    private boolean inTargetElement = false;

    @Override
    public void startElement(String uri, String localName,
            String qName, Attributes attrs) {
        currentValue.setLength(0);
        if (qName.equals("transaction")) {
            if (recordCount >= MAX_RECORDS) {
                // Over the cap: count it, but collect nothing.
                skippedRecords++;
                inTargetElement = false;
            } else {
                inTargetElement = true;
            }
        }
        // Child elements leave the flag alone so that text nested inside a
        // processed transaction is still collected.
    }

    @Override
    public void characters(char[] ch, int start, int length) {
        if (inTargetElement) {
            currentValue.append(ch, start, length);
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) {
        if (!inTargetElement) return;
        // Process element...
        if (qName.equals("transaction")) {
            // Count each record exactly once, when it closes.
            inTargetElement = false;
            recordCount++;
            if (recordCount % 100000 == 0) {
                System.out.printf(
                        "Processed %d records...%n", recordCount);
            }
        }
    }

    @Override
    public void endDocument() {
        System.out.printf(
                "File complete. %d records processed.%n", recordCount);
        if (skippedRecords > 0) {
            System.out.printf(
                    "Skipped %d records beyond the %d-record limit.%n",
                    skippedRecords, MAX_RECORDS);
        }
    }
}
}
Python Streaming with Iterative Parsing
import xml.sax
from collections import Counter
class StreamingStatsHandler(xml.sax.ContentHandler):
"""Process large XML without building full structure"""
def __init__(self):
    """Start with empty statistics and no element open."""
    self.depth = 0
    self.current_value = ''
    self.current_element = ''
    self.stats = Counter()
def startElement(self, name, attrs):
    """Record an opening tag: count it, go one level deeper, reset text."""
    tag_key = f'element_{name}'
    self.stats[tag_key] += 1  # per-tag frequency
    self.depth += 1
    self.current_element = name
    self.current_value = ''
def characters(self, content):
    """Append one chunk of character data to the current text buffer."""
    buffered = self.current_value
    self.current_value = buffered + content
def endElement(self, name):
    """Close an element and bucket the length of its buffered text.

    Text lengths are grouped into ranges of ten characters. Nothing is
    recorded when the element being closed brings the depth back to zero.
    """
    self.depth -= 1
    text = self.current_value.strip()
    # Clear the buffer so this element's text is not counted a second time
    # when its parent closes.
    self.current_value = ''
    if self.depth == 0:
        return
    # Collect statistics without storing all data
    if text:
        # Track text length distribution
        length_range = (len(text) // 10) * 10
        self.stats[f'length_{length_range}-{length_range+10}'] += 1
def endDocument(self):
print("=== Document Statistics ===")
for key, count in self.stats.most_common(20):
print(f"{key}: {count}")
SAX vs StAX vs DOM
| Feature | SAX | StAX | DOM |
|---|---|---|---|
| Model | Push (events) | Pull (cursor) | Tree |
| Memory | Very low | Very low | High |
| Speed | Fast | Fast | Moderate |
| Read/Write | Read only | Read + Write | Read + Write |
| Random access | No | No | Yes |
| XML modification | No | Limited | Full |
| API complexity | Medium | Low | Medium |
When to Use Each
public class ParserSelector {
public enum ParserType { DOM, SAX, STAX }
/**
 * Suggests a parser type for a document.
 *
 * @param fileSize         document size in bytes
 * @param needWrite        whether the document must be modified or written
 * @param needRandomAccess whether nodes must be reachable in arbitrary order
 * @return DOM for files under 10 MiB that need writing or random access;
 *         otherwise STAX when writing is needed; otherwise SAX
 */
public static ParserType recommend(long fileSize, boolean needWrite,
        boolean needRandomAccess) {
    boolean smallEnoughForTree = fileSize < 10 * 1024 * 1024;
    boolean wantsTreeFeatures = needWrite || needRandomAccess;

    if (smallEnoughForTree && wantsTreeFeatures) {
        return ParserType.DOM;
    }
    return needWrite ? ParserType.STAX : ParserType.SAX;
}
}
Common SAX Parsing Mistakes
1. Characters Method Called Multiple Times
The characters() method can be called multiple times for a single text node (especially with large text). Always append to a StringBuilder rather than assuming you get all text at once.
2. Not Trimming Whitespace
XML whitespace between elements generates characters() calls with just whitespace. Always .trim() or .strip() text content. Use ignorableWhitespace() for element-only content.
3. State Machine Mismanagement
SAX requires tracking state manually. Use a stack or flags to track which element you’re currently inside. For nested elements with the same name, a stack is essential.
4. Thread Safety
SAX handlers are not thread-safe. Each parsing thread needs its own handler instance. Don’t share handler state across threads.
5. Error Handling
SAX parsing errors (malformed XML, encoding issues) throw SAXException. Always implement error(), fatalError(), and warning() callbacks for production parsing.
6. Not Using Buffered Streams
Reading large files without buffering is slow. Wrap the FileInputStream in a BufferedInputStream (a buffer of about 1 MB works well for large files); this can improve throughput substantially, so measure it on your own data.
7. Forgetting Namespace Processing
Enable namespace processing for namespace-aware XML:
SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true);
Practice Questions
1. How does SAX differ from DOM in memory usage?
DOM loads the entire XML tree into memory (typically 5-10x the file size). SAX streams the XML and reports events, so it only stores the current parsing state. A 1GB file can take 5-10GB with DOM but less than 10MB with SAX.
2. What is the push vs pull model difference between SAX and StAX?
SAX pushes events to the handler (you react to startElement/endElement). StAX pulls events from the parser (you call next() to advance). StAX gives you control over parsing speed.
3. Why might characters() be called multiple times for one text element?
The parser may split text across multiple characters() calls for large text nodes or when text spans internal buffers. Always accumulate text in a StringBuilder.
4. How do you track which element you’re currently inside with SAX?
Use a Stack or Deque: push element names in startElement, pop in endElement. The top of the stack is the current element. This handles nested elements correctly.
5. Challenge: You have a 2GB XML file with 50 million <record> elements. Each record has 10 fields. You need to count records where status="ERROR". How would you implement this with SAX?
Answer: Create a ContentHandler. In startElement, check if element is record. If so, read status attribute. Track a flag inErrorRecord. In endElement, when record ends, increment counter if flagged. Total memory: < 1MB.
Mini Project: XML SAX Inspector
Create a utility that inspects XML structure without loading the whole file:
import xml.sax
import sys
from collections import defaultdict
class XmlInspector(xml.sax.ContentHandler):
    """Gather tag, attribute, depth and path statistics from SAX events."""

    def __init__(self):
        self.path = []
        self.element_paths = set()
        self.current_depth = 0
        self.max_depth = 0
        self.attribute_counts = defaultdict(int)
        self.element_counts = defaultdict(int)

    def startElement(self, name, attrs):
        """Descend into *name* and update every counter it affects."""
        self.path.append(name)
        self.current_depth += 1
        if self.current_depth > self.max_depth:
            self.max_depth = self.current_depth

        self.element_counts[name] += 1
        self.element_paths.add('/'.join(self.path))

        for attr_name in attrs.getNames():
            self.attribute_counts[f"{name}@{attr_name}"] += 1

    def endElement(self, name):
        """Step back out of the element that just closed."""
        self.path.pop()
        self.current_depth -= 1

    @staticmethod
    def _most_frequent(counts, limit):
        """Return up to *limit* (key, count) pairs, highest count first."""
        ranked = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        return ranked[:limit]

    def report(self):
        """Print the collected statistics to stdout."""
        print("=== XML Structure Report ===")
        print(f"Max nesting depth: {self.max_depth}")
        print(f"Unique element paths: {len(self.element_paths)}")

        print()
        print("--- Element Frequency ---")
        for element, count in self._most_frequent(self.element_counts, 20):
            print(f" <{element}>: {count}")

        print()
        print("--- Attribute Coverage ---")
        for attr, count in self._most_frequent(self.attribute_counts, 10):
            print(f" {attr}: {count}")

        print()
        print("--- Full Paths ---")
        for path in sorted(self.element_paths)[:20]:
            print(f" /{path}")
# Usage: python xml_inspector.py large_file.xml
if __name__ == '__main__':
    if len(sys.argv) != 2:
        # Exit with a usage hint rather than an IndexError traceback.
        sys.exit(f"Usage: python {sys.argv[0]} <file.xml>")
    parser = xml.sax.make_parser()
    handler = XmlInspector()
    parser.setContentHandler(handler)
    parser.parse(sys.argv[1])
    handler.report()

FAQ
What’s Next
Built by the developers of Doda Browser, DodaZIP, and Durga Antivirus Pro. Updated 2026-06-20.
Built by the developers of DodaTech
Doda Browser, DodaZIP & Durga Antivirus Pro