Build a File Converter with Python (Step by Step)
Build a file converter with Python featuring a CLI tool and web interface that converts CSV to JSON, JSON to CSV, Markdown to HTML, HTML to Markdown, and resizes/converts images between formats using Pillow — with file upload, conversion, and download.
What You’ll Build
You’ll build a dual-interface file conversion tool: a CLI for scripting/batch conversions and a Flask web app for interactive use. Upload a CSV, get JSON back. Drop in a Markdown file, get rendered HTML. Resize images, convert PNG to JPEG, apply filters. This same conversion engine powers DodaZIP’s built-in file converter.
Why Build a File Converter?
File conversion is a universal need — data migration, content publishing, image optimization. Building one teaches you data serialization, format parsing edge cases, handling encoding differences, error handling for malformed files, and serving converted files for download. The patterns apply to ETL pipelines, CMS importers, and media processing services.
Prerequisites
- Python 3.10+ installed
- Basic Flask or FastAPI knowledge
- Understanding of JSON and CSV formats
- pip package manager
Step 1: Project Setup
mkdir file-converter
cd file-converter
python -m venv venv
source venv/bin/activate
pip install flask pillow markdown bleach python-dotenv
Step 2: Core Conversion Engine
# converter.py
import csv
import json
import io
import markdown
import bleach
from PIL import Image
from pathlib import Path
from typing import Union, Tuple, Optional
class ConversionError(Exception):
    """Signal that an input could not be converted to the requested format.

    Every converter in this module raises this single exception type so that
    callers can handle conversion failures separately from unexpected errors.
    """
def csv_to_json(csv_content: str, delimiter: str = ",") -> str:
    """Convert CSV text into a pretty-printed JSON array of objects.

    The first CSV row supplies the keys; every following row becomes one
    object whose values are strings.

    Args:
        csv_content: The full CSV document as text.
        delimiter: Single character separating the fields.

    Returns:
        A JSON string (2-space indent) holding one object per data row.

    Raises:
        ConversionError: If the csv module rejects the input, or if there
            are no data rows after the header.
    """
    # Spreadsheet exports often start with a UTF-8 byte-order mark; left in
    # place it becomes part of the first column name.
    if csv_content.startswith("\ufeff"):
        csv_content = csv_content[1:]

    try:
        # newline="" passes line endings to the csv module untranslated, so
        # "\n", "\r\n" and bare "\r" all terminate a record correctly.
        buffer = io.StringIO(csv_content, newline="")
        rows = list(csv.DictReader(buffer, delimiter=delimiter))
    except csv.Error as e:
        raise ConversionError(f"Invalid CSV format: {e}") from e

    if not rows:
        raise ConversionError("CSV file is empty or has no data rows")
    return json.dumps(rows, indent=2)
def json_to_csv(json_content: str) -> str:
    """Convert a JSON array of objects (or a single object) into CSV text.

    The header is the alphabetically sorted union of all keys found in the
    objects. Missing keys and ``null`` values become empty cells; every other
    value is written as ``str(value)``.

    Args:
        json_content: JSON text containing an object or a non-empty array of
            objects.

    Returns:
        CSV text with a header row, as produced by ``csv.DictWriter``.

    Raises:
        ConversionError: If the text is not valid JSON, or the top-level
            value is not an object / non-empty array of objects.
    """
    try:
        data = json.loads(json_content)
    except json.JSONDecodeError as e:
        raise ConversionError(f"Invalid JSON format: {e}") from e

    if isinstance(data, dict):
        data = [data]
    # Reject scalars, empty arrays, and arrays containing non-objects up front;
    # otherwise row.keys() below fails with an unrelated AttributeError.
    if (
        not isinstance(data, list)
        or not data
        or not all(isinstance(row, dict) for row in data)
    ):
        raise ConversionError("JSON must be an array of objects or a single object")

    fieldnames = sorted({key for row in data for key in row})
    output = io.StringIO()
    writer = csv.DictWriter(output, fieldnames=fieldnames)
    writer.writeheader()
    for row in data:
        writer.writerow({k: str(v) if v is not None else "" for k, v in row.items()})
    return output.getvalue()
def markdown_to_html(md_content: str) -> str:
    """Render Markdown to HTML and sanitize the result with bleach.

    Args:
        md_content: Markdown source text.

    Returns:
        The rendered HTML with every tag/attribute outside the allow-lists
        below stripped (``strip=True`` removes the tag rather than escaping it).

    Raises:
        ConversionError: If rendering or sanitizing raises any exception.
    """
    # Allow-list of tags, grouped by role; the concatenation yields the same
    # list, in the same order, that is handed to bleach.
    structure = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "br", "hr"]
    lists = ["ul", "ol", "li", "dl", "dt", "dd"]
    inline = ["strong", "em", "b", "i", "u", "s", "del", "ins"]
    media_and_code = ["a", "img", "code", "pre", "blockquote"]
    tables = ["table", "thead", "tbody", "tr", "th", "td"]
    containers = ["div", "span", "section", "header", "footer"]
    permitted_tags = structure + lists + inline + media_and_code + tables + containers

    permitted_attributes = {
        "a": ["href", "title", "target"],
        "img": ["src", "alt", "title", "width", "height"],
        "*": ["class", "id"],
    }
    markdown_extensions = ["fenced_code", "tables", "nl2br", "codehilite"]

    try:
        rendered = markdown.markdown(md_content, extensions=markdown_extensions)
        return bleach.clean(
            rendered,
            tags=permitted_tags,
            attributes=permitted_attributes,
            strip=True,
        )
    except Exception as e:
        raise ConversionError(f"Markdown conversion failed: {e}")
def html_to_markdown(html_content: str) -> str:
    """Convert basic HTML string to Markdown string (simplified).

    Handles headings, bold/italic, inline code, ``<pre>`` blocks (emitted as
    fenced code), blockquotes, ``<br>``, ``<hr>`` and bulleted lists. Any other
    tag is dropped and only its text is kept.

    Args:
        html_content: HTML markup to convert.

    Returns:
        The Markdown text with leading and trailing whitespace stripped.
    """
    from html.parser import HTMLParser

    # Text emitted when a tag opens.
    opening_text = {
        "h1": "# ", "h2": "## ", "h3": "### ",
        "h4": "#### ", "h5": "##### ", "h6": "###### ",
        "blockquote": "> ",
        "br": "\n",
        "hr": "\n---\n",
    }
    # Markers that wrap inline content; emitted on both open and close.
    inline_markers = {"strong": "**", "b": "**", "em": "*", "i": "*", "code": "`"}
    # Tags whose closing ends the current Markdown line.
    line_ending_tags = {"p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "blockquote"}

    class HTMLToMarkdown(HTMLParser):
        def __init__(self):
            super().__init__()
            self.result = []
            # A depth counter (not a flag) so closing a nested list does not
            # make the parser forget it is still inside the outer list.
            self.list_depth = 0
            self.in_code = False
            self.code_buffer = []

        def handle_starttag(self, tag, attrs):
            if tag == "pre":
                self.in_code = True
                self.code_buffer = []
            elif self.in_code:
                # Markup inside <pre> (typically <code>) must not add inline
                # markers: they would end up in front of the opening fence.
                return
            elif tag in ("ul", "ol"):
                # Start a nested list on its own line.
                if self.list_depth and self.result and not self.result[-1].endswith("\n"):
                    self.result.append("\n")
                self.list_depth += 1
            elif tag == "li":
                # Top-level items are flush left; each nesting level adds two spaces.
                indent = "  " * max(self.list_depth - 1, 0)
                self.result.append(f"{indent}- ")
            elif tag in inline_markers:
                self.result.append(inline_markers[tag])
            elif tag in opening_text:
                self.result.append(opening_text[tag])

        def handle_endtag(self, tag):
            if tag == "pre":
                if self.in_code:
                    self.in_code = False
                    self.result.append(f"```\n{''.join(self.code_buffer)}\n```\n")
            elif self.in_code:
                return
            elif tag in ("ul", "ol"):
                self.list_depth = max(self.list_depth - 1, 0)
            elif tag in inline_markers:
                self.result.append(inline_markers[tag])
            elif tag in line_ending_tags:
                self.result.append("\n")

        def handle_data(self, data):
            if self.in_code:
                self.code_buffer.append(data)
            else:
                self.result.append(data)

    parser = HTMLToMarkdown()
    parser.feed(html_content)
    # close() flushes text the parser may still be holding back (for example a
    # trailing fragment containing an unterminated "&").
    parser.close()
    return "".join(parser.result).strip()
def resize_image(
    image_data: bytes,
    width: Optional[int] = None,
    height: Optional[int] = None,
    output_format: str = "JPEG",
    maintain_aspect: bool = True,
) -> Tuple[bytes, str]:
    """Resize and/or re-encode an image held in memory.

    Args:
        image_data: Encoded source image bytes (any format Pillow can open).
        width: Target width in pixels; the source width is used when omitted.
        height: Target height in pixels; the source height is used when omitted.
        output_format: Pillow format name, case-insensitive. ``"JPG"`` is
            accepted as an alias for ``"JPEG"``.
        maintain_aspect: When true the image is shrunk to fit inside
            ``width`` x ``height`` via ``Image.thumbnail``; when false it is
            resized to exactly that size.

    Returns:
        ``(encoded_bytes, mime_type)``. Formats missing from the MIME table
        are reported as ``image/jpeg``.

    Raises:
        ConversionError: If opening, resizing or saving the image fails.
    """
    format_map = {
        "JPEG": "image/jpeg",
        "PNG": "image/png",
        "WEBP": "image/webp",
        "GIF": "image/gif",
    }
    # Modes the JPEG encoder can write directly; anything else (RGBA, LA, P,
    # PA, ...) has to be flattened to RGB first or save() raises.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    try:
        target_format = output_format.upper()
        if target_format == "JPG":
            # Pillow only registers the encoder under "JPEG".
            target_format = "JPEG"

        img = Image.open(io.BytesIO(image_data))
        if width or height:
            size = (width or img.width, height or img.height)
            if maintain_aspect:
                img.thumbnail(size, Image.LANCZOS)
            else:
                img = img.resize(size, Image.LANCZOS)

        if target_format == "JPEG" and img.mode not in jpeg_modes:
            img = img.convert("RGB")

        output = io.BytesIO()
        img.save(output, format=target_format)
        return output.getvalue(), format_map.get(target_format, "image/jpeg")
    except Exception as e:
        raise ConversionError(f"Image conversion failed: {e}") from e
def get_supported_conversions():
    """Return the catalogue of conversions offered by the engine.

    Returns:
        A new dict mapping each conversion key to a dict with the accepted
        input extension(s) (``"from"``), the output extension(s) (``"to"``)
        and a human-readable ``"name"``.
    """
    # (key, source extensions, target extensions, display name)
    catalogue = (
        ("csv_to_json", ".csv", ".json", "CSV to JSON"),
        ("json_to_csv", ".json", ".csv", "JSON to CSV"),
        ("md_to_html", ".md", ".html", "Markdown to HTML"),
        ("html_to_md", ".html", ".md", "HTML to Markdown"),
        ("image_resize", ".jpg,.png,.webp,.gif", ".jpg,.png,.webp", "Image Resize/Convert"),
    )
    return {
        key: {"from": source, "to": target, "name": label}
        for key, source, target, label in catalogue
    }
def convert_file(content: bytes, conversion_type: str, **kwargs) -> Tuple[Union[str, bytes], str]:
    """Main dispatch function.

    Args:
        content: Raw bytes of the uploaded / input file.
        conversion_type: One of ``"csv_to_json"``, ``"json_to_csv"``,
            ``"md_to_html"``, ``"html_to_md"`` or ``"image_resize"``.
        **kwargs: Options for ``"image_resize"`` only: ``width``, ``height``,
            ``output_format`` (default ``"JPEG"``) and ``maintain_aspect``
            (default ``True``).

    Returns:
        ``(result, mime_type)`` where ``result`` is ``str`` for the text
        conversions and ``bytes`` for images.

    Raises:
        ConversionError: If the conversion type is unknown, a text input is
            not valid UTF-8, or the selected converter rejects the input.
    """
    text_conversions = {
        "csv_to_json": (csv_to_json, "application/json"),
        "json_to_csv": (json_to_csv, "text/csv"),
        "md_to_html": (markdown_to_html, "text/html"),
        "html_to_md": (html_to_markdown, "text/markdown"),
    }

    if conversion_type in text_conversions:
        converter, mime = text_conversions[conversion_type]
        try:
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise leak into the first CSV header / JSON token.
            text = content.decode("utf-8-sig")
        except UnicodeDecodeError as e:
            raise ConversionError(f"Input is not valid UTF-8 text: {e}") from e
        return converter(text), mime

    if conversion_type == "image_resize":
        return resize_image(
            content,
            kwargs.get("width"),
            kwargs.get("height"),
            kwargs.get("output_format", "JPEG"),
            kwargs.get("maintain_aspect", True),
        )

    raise ConversionError(f"Unknown conversion type: {conversion_type}")

# Step 3: CLI Interface
# cli.py
#!/usr/bin/env python3
import argparse
import sys
from pathlib import Path
from converter import convert_file, get_supported_conversions, ConversionError
def main(argv=None):
    """Run the command-line converter.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Text results are printed to stdout unless ``-o`` is given. Binary results
    are always written to a file: the ``-o`` path, or
    ``<input stem>_converted<ext>`` next to the input when ``-o`` is omitted.
    Exits with status 1 when the input is missing or the conversion fails.
    """
    parser = argparse.ArgumentParser(description="File Converter CLI")
    parser.add_argument("input", help="Input file path")
    parser.add_argument("conversion", help="Conversion type", choices=get_supported_conversions().keys())
    parser.add_argument("-o", "--output", help="Output file path (default: auto)")
    parser.add_argument("--width", type=int, help="Resize width (image only)")
    parser.add_argument("--height", type=int, help="Resize height (image only)")
    parser.add_argument("--format", help="Output format (image only: JPEG, PNG, WEBP)")
    args = parser.parse_args(argv)

    input_path = Path(args.input)
    # is_file() also rejects directories, which exists() would let through
    # only to fail later in read_bytes().
    if not input_path.is_file():
        print(f"Error: File not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    requested = {"width": args.width, "height": args.height, "output_format": args.format}
    kwargs = {name: value for name, value in requested.items() if value}

    # File extension for binary results when no -o path was given.
    binary_extensions = {
        "image/jpeg": ".jpg",
        "image/png": ".png",
        "image/webp": ".webp",
        "image/gif": ".gif",
    }

    try:
        content = input_path.read_bytes()
        result, mime = convert_file(content, args.conversion, **kwargs)

        if isinstance(result, str):
            if not args.output:
                print(result)
                return
            output_path = Path(args.output)
            # Explicit UTF-8 and newline="" so the output does not depend on
            # the platform: no locale encoding, and the "\r\n" produced by the
            # CSV writer is not expanded to "\r\r\n" on Windows.
            with output_path.open("w", encoding="utf-8", newline="") as handle:
                handle.write(result)
        else:
            if args.output:
                output_path = Path(args.output)
            else:
                extension = binary_extensions.get(mime, ".converted")
                output_path = input_path.with_name(f"{input_path.stem}_converted{extension}")
            output_path.write_bytes(result)

        print(f"Converted: {args.input} → {output_path}")
    except ConversionError as e:
        print(f"Conversion error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == "__main__":
    main()
Step 4: Web Interface
# app.py
from flask import Flask, request, render_template, send_file, flash, redirect, url_for
from converter import convert_file, get_supported_conversions, ConversionError
import tempfile
import os
# Flask application object that the route decorators below register on.
app = Flask(__name__)
# Key Flask uses to sign session data (flash() messages rely on the session).
# NOTE(review): the "dev-secret-key" fallback is only acceptable for local
# development; set the SECRET_KEY environment variable anywhere else.
app.secret_key = os.getenv("SECRET_KEY", "dev-secret-key")
# Upper bound, in bytes, on the size of an incoming request (and so of an upload).
app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024 # 16MB
# Upload extensions accepted by allowed_file(); stored lowercase because
# allowed_file() lowercases the extension before the lookup.
ALLOWED_EXTENSIONS = {"csv", "json", "md", "html", "png", "jpg", "jpeg", "webp", "gif"}
def allowed_file(filename):
    """Return True when the text after the last dot is an allowed extension.

    The comparison is case-insensitive. Names without a dot are rejected.
    """
    _, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route("/")
def index():
    """Render the upload form, passing it the catalogue of conversions."""
    conversions = get_supported_conversions()
    return render_template("index.html", conversions=conversions)
@app.route("/convert", methods=["POST"])
def convert():
    """Convert the uploaded file and return the result as a download.

    Reads the ``file`` upload and the ``conversion_type`` form field (plus
    ``width``, ``height`` and ``output_format`` for ``image_resize``). On any
    validation or conversion problem a message is flashed and the client is
    redirected to the index page; otherwise the converted data is sent as an
    attachment named ``<original stem>_converted<suffix>``.
    """
    # Imported locally so this handler does not depend on a module-level import.
    import io

    def reject(message):
        """Flash *message* and send the user back to the form."""
        flash(message)
        return redirect(url_for("index"))

    file = request.files.get("file")
    if file is None or file.filename == "":
        return reject("No file selected")
    if not allowed_file(file.filename):
        # sorted() keeps the message stable; set iteration order is not.
        return reject(f"File type not allowed. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}")

    conversion_type = request.form.get("conversion_type")
    if not conversion_type:
        return reject("Please select a conversion type")

    kwargs = {}
    if conversion_type == "image_resize":
        # Parse by hand: request.form.get(..., type=int) silently returns None
        # for bad input, so an invalid dimension would never be reported.
        for field in ("width", "height"):
            raw = request.form.get(field, "").strip()
            if not raw:
                continue
            try:
                value = int(raw)
            except ValueError:
                return reject("Invalid image dimensions")
            if value <= 0:
                return reject("Invalid image dimensions")
            kwargs[field] = value
        kwargs["output_format"] = request.form.get("output_format", "JPEG")

    try:
        result, mime = convert_file(file.read(), conversion_type, **kwargs)
    except ConversionError as e:
        return reject(f"Conversion error: {e}")
    except Exception as e:
        return reject(f"Unexpected error: {e}")

    suffix = {
        "application/json": ".json",
        "text/csv": ".csv",
        "text/html": ".html",
        "text/markdown": ".md",
        "image/jpeg": ".jpg",
        "image/png": ".png",
        "image/webp": ".webp",
        "image/gif": ".gif",
    }.get(mime, ".out")

    payload = result.encode("utf-8") if isinstance(result, str) else result
    # Drop any directory part the client may have put in the file name.
    original_name = os.path.basename(file.filename).rsplit(".", 1)[0]
    download_name = f"{original_name}_converted{suffix}"

    # Serve from memory: uploads are capped by MAX_CONTENT_LENGTH, and unlike a
    # NamedTemporaryFile(delete=False) nothing is left behind on disk.
    return send_file(
        io.BytesIO(payload),
        mimetype=mime,
        as_attachment=True,
        download_name=download_name,
    )
if __name__ == "__main__":
    app.run(debug=True, port=5000)
<!-- templates/index.html -->
<!DOCTYPE html>
<html>
<head>
  <!-- Declare the encoding explicitly: the page contains non-ASCII characters. -->
  <meta charset="utf-8">
  <!-- Without a viewport the fixed 600px layout is scaled down on phones. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>File Converter</title>
  <style>
    * { margin: 0; padding: 0; box-sizing: border-box; }
    body { font-family: system-ui, sans-serif; background: #f0f4f8; min-height: 100vh; padding: 40px 20px; }
    .container { max-width: 600px; margin: 0 auto; }
    h1 { text-align: center; margin-bottom: 8px; color: #1a202c; }
    .subtitle { text-align: center; color: #718096; margin-bottom: 32px; }
    .card { background: white; border-radius: 16px; padding: 32px; box-shadow: 0 4px 16px rgba(0,0,0,0.06); }
    .flash { background: #fed7d7; color: #c53030; padding: 12px 16px; border-radius: 8px; margin-bottom: 20px; }
    .form-group { margin-bottom: 20px; }
    label { display: block; margin-bottom: 6px; font-weight: 600; color: #2d3748; }
    select, input[type="file"] { width: 100%; padding: 10px 12px; border: 2px solid #e2e8f0; border-radius: 8px; font-size: 14px; }
    select:focus { border-color: #4299e1; outline: none; }
    /* Hidden until the "image_resize" conversion is selected (see the script at the end of <body>). */
    .image-options { display: none; background: #f7fafc; padding: 16px; border-radius: 8px; margin-top: 12px; }
    .image-options.show { display: block; }
    .image-options .row { display: flex; gap: 12px; margin-bottom: 12px; }
    .image-options input, .image-options select { flex: 1; }
    .btn { width: 100%; padding: 14px; background: #4299e1; color: white; border: none; border-radius: 8px; font-size: 16px; font-weight: 600; cursor: pointer; }
    .btn:hover { background: #3182ce; }
    .info { margin-top: 24px; padding: 16px; background: #ebf8ff; border-radius: 8px; color: #2b6cb0; font-size: 14px; }
    .info code { background: #bee3f8; padding: 2px 6px; border-radius: 4px; }
  </style>
</head>
<body>
  <div class="container">
    <h1>File Converter</h1>
    <p class="subtitle">Convert CSV ↔ JSON, Markdown ↔ HTML, Resize Images</p>
    <div class="card">
      <!-- Messages queued with flash() by the /convert route. -->
      {% with messages = get_flashed_messages() %}
        {% if messages %}
          {% for message in messages %}
            <div class="flash">{{ message }}</div>
          {% endfor %}
        {% endif %}
      {% endwith %}
      <form method="POST" action="/convert" enctype="multipart/form-data">
        <div class="form-group">
          <label for="conversion_type">Conversion Type</label>
          <select name="conversion_type" id="conversion_type" required>
            <option value="">Select conversion...</option>
            {% for key, conv in conversions.items() %}
              <option value="{{ key }}">{{ conv.name }}</option>
            {% endfor %}
          </select>
        </div>
        <div class="form-group">
          <label for="file">Choose File</label>
          <!-- accept mirrors the server-side ALLOWED_EXTENSIONS check; it is a picker hint, not validation. -->
          <input type="file" name="file" id="file" required
                 accept=".csv,.json,.md,.html,.png,.jpg,.jpeg,.webp,.gif">
        </div>
        <!-- Only shown for the "image_resize" conversion. The controls have no
             visible label, so aria-label gives them an accessible name. -->
        <div class="image-options" id="imageOptions">
          <div class="row">
            <input type="number" name="width" placeholder="Width (px)" min="1" aria-label="Width in pixels">
            <input type="number" name="height" placeholder="Height (px)" min="1" aria-label="Height in pixels">
          </div>
          <div class="row">
            <select name="output_format" aria-label="Output format">
              <option value="JPEG">JPEG</option>
              <option value="PNG">PNG</option>
              <option value="WEBP">WEBP</option>
            </select>
          </div>
        </div>
        <button type="submit" class="btn">Convert &amp; Download</button>
      </form>
      <div class="info">
        <strong>CLI Usage:</strong>
        <code>python cli.py input.csv csv_to_json -o output.json</code><br>
        <code>python cli.py input.md md_to_html -o output.html</code><br>
        <code>python cli.py input.jpg image_resize --width 800 --format PNG</code>
      </div>
    </div>
  </div>
  <script>
    // Reveal the width/height/format controls only for the image conversion.
    document.getElementById("conversion_type").addEventListener("change", function() {
      const options = document.getElementById("imageOptions");
      options.classList.toggle("show", this.value === "image_resize");
    });
  </script>
</body>
</html>
Step 5: Run
python app.py
Expected output:
* Serving Flask app 'app'
* Debug mode: on
* Running on http://127.0.0.1:5000
Create a test CSV and convert it:
printf 'name,age,city\nAlice,30,NYC\nBob,25,LA\n' > test.csv
python cli.py test.csv csv_to_json
Expected CLI output:
[
  {
    "name": "Alice",
    "age": "30",
    "city": "NYC"
  },
  {
    "name": "Bob",
    "age": "25",
    "city": "LA"
  }
]
Architecture
flowchart LR
A[User] --> B{Interface}
B --> C[CLI: python cli.py]
B --> D[Web: Flask browser]
C --> E{Conversion Engine}
D --> E
E --> F[CSV ↔ JSON]
E --> G[MD ↔ HTML]
E --> H[Image Resize/Convert]
F --> I[Download/Output]
G --> I
H --> I
Common Errors
1. “File type not allowed” for a valid CSV file
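The allowed_file() function lowercases the extension before comparing it, so .CSV and .csv are both accepted. Only the part after the last dot is checked: a file with no extension is rejected, and a file with a double extension is judged by its final one (data.csv.json is treated as JSON, data.csv.txt is rejected). If you see this error for a CSV file, check that the name really ends in .csv and not, for example, .csv.txt.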
2. CSV to JSON produces empty array
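The CSV must have a header row (first row = column names). csv.DictReader always uses the first row as the dictionary keys, so if your file has no header row, the first data row is consumed as the keys and is missing from the output. If the file contains only a header row (or nothing at all), there are no data rows and csv_to_json raises "CSV file is empty or has no data rows" instead of returning an empty array. Add a header row to your CSV file.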
3. Markdown images not displaying in HTML output
The bleach library strips <img> tags by default for security. We’ve added “img” to the allowed_tags list and “src”, “alt”, “title”, “width”, “height” to allowed_attrs. If images still don’t appear, check that the Markdown source uses the correct syntax: ![alt text](path/to/image.png).
4. Image conversion fails with “cannot write mode RGBA as JPEG”
JPEG doesn’t support alpha channel (transparency). When converting a PNG with transparency to JPEG, the RGBA mode must be converted to RGB. Our resize_image function handles this with img.convert("RGB") for JPEG output.
Practice Questions
1. Why use csv.DictReader instead of csv.reader?
csv.DictReader maps CSV columns to dictionary keys using the header row. This produces structured data that converts naturally to JSON (array of objects). csv.reader returns raw row lists with no column mapping — you’d need to manually handle headers and indexing.
2. How does bleach.clean() improve security?
Without sanitization, a Markdown file containing <script>alert('xss')</script> would render as executable JavaScript in the browser. bleach.clean() strips dangerous tags and attributes while preserving safe HTML, preventing XSS attacks when displaying converted HTML.
3. Why use temporary files for the download instead of in-memory streams?
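Flask’s send_file() accepts either a file path or a file-like object. Writing the result to a temp file keeps large outputs out of memory, but a file created with NamedTemporaryFile(delete=False) is not removed automatically: you must delete it yourself once the response has been sent (for example in a response.call_on_close callback), or the temp directory will fill up. Because uploads here are capped at 16 MB by MAX_CONTENT_LENGTH, passing an in-memory io.BytesIO to send_file(..., as_attachment=True, download_name=...) is simpler and leaves nothing to clean up.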
4. Challenge: Add batch conversion
Add CLI support for converting all files in a directory with python cli.py input_dir/*.csv csv_to_json. Use glob to match files. Add a progress bar with tqdm. Process files concurrently with ThreadPoolExecutor.
5. Challenge: Add PDF-to-text conversion
Integrate pdfplumber or PyMuPDF to extract text from PDF files. Add a new conversion type pdf_to_text. Handle multi-page PDFs, tables, and embedded images. The text output should preserve basic formatting with line breaks and spacing.
FAQ
Next Steps
- Add Docker containerization for deployment
- Explore the File Upload Service project for advanced upload handling
- Learn Python generators for memory-efficient streaming
- Build the Image Processing Service project for advanced image manipulation
Built by the developers of DodaTech
Doda Browser, DodaZIP & Durga Antivirus Pro