← All articles

Batch-verify a document archive with Python

April 24, 2026 · python · tutorial · blockchain · verification

You've got a folder with 500 legal documents. Half have blockchain timestamp proofs, half don't. Discovery deadline is next week. How do you quickly sort the verified from the unverified without checking each file manually? ProofAnchor (proofanchor.com) is a service that anchors SHA-256 hashes of files to the Polygon blockchain, creating immutable proof files alongside your documents. Here's how to batch-verify an entire archive in Python.

The discovery problem

Legal teams and insurance adjusters face this constantly. A client submits a USB drive with "all relevant documents from 2024." Some files have .proof.json companions (blockchain timestamps), others don't. Which ones can you authenticate in court? Which need additional verification steps?

The manual approach doesn't scale. Open file, check for proof file, load proof, verify hash, repeat 500 times. That's hours of tedious work prone to human error.

Python can automate the entire workflow. Install the verification package:

pip install verify-proof

The verify-proof package handles the cryptographic verification behind the scenes. Point it at a file and its proof, get back a verified/unverified result with blockchain details.

Batch verification script

Here's a script that walks a directory tree, pairs each document with its proof file, and outputs a CSV summary:

import os
import json
import csv
from pathlib import Path
from verify_proof import hash_file, verify_proof

def batch_verify_archive(archive_path, output_csv):
    """Walk ``archive_path``, verify each document against its
    ``<name>.proof.json`` companion, and write a CSV summary.

    Parameters:
        archive_path: Root directory to scan recursively.
        output_csv: Path of the CSV report to write.

    Returns:
        The list of per-file result dicts (one per non-proof file).
    """
    report_columns = ['file_path', 'file_size', 'has_proof', 'verified',
                      'blockchain', 'anchored_at', 'error']
    rows = []

    for dirpath, _subdirs, filenames in os.walk(archive_path):
        for name in filenames:
            # Proof companions are metadata, not documents to verify.
            if name.endswith('.proof.json'):
                continue

            doc_path = os.path.join(dirpath, name)
            companion = doc_path + '.proof.json'

            entry = dict.fromkeys(report_columns)
            entry['file_path'] = doc_path
            entry['file_size'] = os.path.getsize(doc_path)
            entry['has_proof'] = os.path.exists(companion)
            entry['verified'] = False

            if entry['has_proof']:
                try:
                    with open(companion, 'r') as fh:
                        proof = json.load(fh)

                    # Recompute the document hash and check it against the
                    # anchored proof.
                    digest = hash_file(doc_path)
                    outcome = verify_proof(digest, proof)

                    entry['verified'] = outcome['verified']
                    entry['blockchain'] = outcome.get('blockchain')
                    entry['anchored_at'] = outcome.get('anchored_at')

                except Exception as exc:
                    # Record the failure instead of aborting the whole batch.
                    entry['error'] = str(exc)

            rows.append(entry)

    # Emit the filterable CSV report.
    with open(output_csv, 'w', newline='') as report:
        writer = csv.DictWriter(report, fieldnames=report_columns)
        writer.writeheader()
        writer.writerows(rows)

    return rows

if __name__ == "__main__":
    # Run against the local ./documents tree and emit a CSV report.
    summary = batch_verify_archive('./documents', 'verification_report.csv')

    proof_count = sum(1 for entry in summary if entry['has_proof'])
    verified_count = sum(1 for entry in summary if entry['verified'])

    print(f"Processed {len(summary)} files")
    print(f"Found proofs for {proof_count} files")
    print(f"Successfully verified {verified_count} proofs")

The script expects proof files to follow the naming convention document.pdf.proof.json alongside document.pdf. Each proof file contains the blockchain transaction details and cryptographic proof that links the file's SHA-256 hash to a specific timestamp.

Run it against any directory structure. The CSV output gives you filterable results you can sort by verification status, file size, or timestamp.

Enhanced reporting and error handling

Legal work demands detailed audit trails. Here's an extended version that captures more forensic details:

import os
import json
import csv
import hashlib
from datetime import datetime
from verify_proof import hash_file, verify_proof

def enhanced_batch_verify(archive_path, output_csv):
    """Verify every document under ``archive_path`` against its blockchain
    proof and write a forensic CSV report to ``output_csv``.

    For each non-proof file the report records its size, modification time,
    whether a ``<name>.proof.json`` companion exists, and — when it does —
    the verification outcome plus blockchain/transaction details.

    Parameters:
        archive_path: Root directory to walk recursively.
        output_csv: Path of the CSV report to write.

    Returns:
        The list of per-file result dicts, sorted with verified files
        first and then by relative path.
    """
    # Fixed column order for the report. Previously this was derived from
    # the leaked loop variable `result` at write time, which raised
    # NameError on an empty archive and coupled the CSV schema to the last
    # row processed.
    fieldnames = ['file_path', 'file_size', 'modified_time', 'has_proof',
                  'verified', 'blockchain', 'transaction_id', 'anchored_at',
                  'proof_service', 'error']
    results = []

    for root, dirs, files in os.walk(archive_path):
        for file in files:
            # Skip proof files themselves.
            if file.endswith('.proof.json'):
                continue

            file_path = os.path.join(root, file)
            proof_path = file_path + '.proof.json'

            # File metadata for the audit trail.
            stat = os.stat(file_path)

            result = {
                'file_path': os.path.relpath(file_path, archive_path),
                'file_size': stat.st_size,
                'modified_time': datetime.fromtimestamp(stat.st_mtime).isoformat(),
                'has_proof': os.path.exists(proof_path),
                'verified': False,
                'blockchain': None,
                'transaction_id': None,
                'anchored_at': None,
                'proof_service': None,
                'error': None
            }

            if result['has_proof']:
                try:
                    with open(proof_path, 'r') as f:
                        proof_data = json.load(f)

                    file_hash = hash_file(file_path)
                    verification = verify_proof(file_hash, proof_data)

                    result.update({
                        'verified': verification['verified'],
                        'blockchain': verification.get('blockchain'),
                        'transaction_id': verification.get('tx_id'),
                        'anchored_at': verification.get('anchored_at'),
                        'proof_service': verification.get('service')
                    })

                except json.JSONDecodeError:
                    result['error'] = 'Invalid JSON in proof file'
                except FileNotFoundError:
                    # Proof vanished between the exists() check and open().
                    result['error'] = 'Proof file not found'
                except Exception as e:
                    result['error'] = f'Verification failed: {str(e)}'

            results.append(result)

    # Sort by verification status (verified first), then by file path.
    results.sort(key=lambda x: (not x['verified'], x['file_path']))

    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    # Print summary statistics.
    total = len(results)
    with_proofs = sum(1 for r in results if r['has_proof'])
    verified = sum(1 for r in results if r['verified'])
    errors = sum(1 for r in results if r['error'])

    print("\nVerification Summary:")
    print(f"Total files: {total}")
    print(f"With proof files: {with_proofs}")
    print(f"Successfully verified: {verified}")
    print(f"Verification errors: {errors}")

    if verified > 0:
        earliest = min((r['anchored_at'] for r in results if r['anchored_at']), default='N/A')
        latest = max((r['anchored_at'] for r in results if r['anchored_at']), default='N/A')
        print(f"Timestamp range: {earliest} to {latest}")

    return results

This version captures file modification times, sorts results by verification status, and provides a detailed summary. The CSV becomes a complete forensic report you can submit as part of your evidence package.

What's next

Batch verification scales to handle thousands of documents in minutes rather than hours. The CSV output integrates cleanly with legal case management systems or insurance claim databases. For more verification options and API details, check out the verify-proof package on PyPI or explore the source code at github.com/Fulcrum-Enterprises/verify-proof. The next tutorial in this series covers real-time folder monitoring for proof verification as new documents arrive.