commit ca1f80d47072d32fc932f24783ed483716ce49f5 Author: Space-Banane <64922620+Space-Banane@users.noreply.github.com> Date: Fri Jan 16 21:30:17 2026 +0100 first commit diff --git a/README.md b/README.md new file mode 100644 index 0000000..c89fb39 --- /dev/null +++ b/README.md @@ -0,0 +1,184 @@ +# Big File Generator + +A collection of Python CLI tools for creating and reading large binary files. Useful for testing disk I/O performance, storage systems, and file transfer mechanisms. + +## Tools + +### `make_big_file.py` - File Generator + +Creates large binary files filled with zeros for testing purposes. + +**Features:** +- Configurable file size with human-readable units (GB, TB, MB, etc.) +- Adjustable chunk size for write optimization +- Disk space validation before writing +- Real-time progress reporting with speed metrics +- Prevents accidental file overwrites +- Graceful interrupt handling with cleanup +- Quiet mode for scripting + +**Usage:** +```bash +python make_big_file.py [options] +``` + +**Arguments:** +- `output` - Output file path +- `size` - File size (e.g., 15GB, 1.5TB, 500MB) + +**Options:** +- `--chunk-size ` - Chunk size for writing (default: 64MB) +- `--quiet, -q` - Suppress progress output +- `--version` - Show version information +- `--help, -h` - Show help message + +**Examples:** +```bash +# Create a 15GB file +python make_big_file.py output.bin 15GB + +# Create a 1.5TB file with 128MB chunks +python make_big_file.py bigfile.dat 1.5TB --chunk-size 128MB + +# Create a 500MB file quietly +python make_big_file.py test.bin 500MB --quiet +``` + +### `read_big_file.py` - File Reader & Benchmark + +Reads large files and measures I/O performance, optionally computing checksums. 
+ +**Features:** +- Configurable chunk size for read optimization +- Real-time progress reporting with speed metrics +- SHA256 hash computation option +- File validation before reading +- Quiet mode for scripting +- Graceful interrupt handling + +**Usage:** +```bash +python read_big_file.py [options] +``` + +**Arguments:** +- `input` - Input file path to read + +**Options:** +- `--chunk-size ` - Chunk size for reading (default: 64MB) +- `--hash` - Compute SHA256 hash of the file +- `--quiet, -q` - Suppress progress output +- `--version` - Show version information +- `--help, -h` - Show help message + +**Examples:** +```bash +# Read a large file +python read_big_file.py largefile.bin + +# Read with 128MB chunks and compute hash +python read_big_file.py test.dat --chunk-size 128MB --hash + +# Read quietly and compute hash +python read_big_file.py data.bin --hash --quiet +``` + +## Installation + +No external dependencies required. Works with Python 3.6+. + +```bash +# Clone or download the scripts +git clone +cd bigfilegen + +# Make scripts executable (optional, Unix/Linux/Mac) +chmod +x make_big_file.py read_big_file.py +``` + +## Requirements + +- Python 3.6 or higher +- Sufficient disk space for file creation +- Read/write permissions in target directories + +## Performance Tips + +### Chunk Size Optimization +- **SSDs**: Use larger chunks (64-128MB) for better performance +- **HDDs**: Use moderate chunks (32-64MB) to balance speed and memory +- **Network drives**: Experiment with different sizes based on network speed + +### File System Considerations +- **NTFS** (Windows): Supports files up to 16 EiB +- **exFAT**: Good for large files on external drives +- **ext4** (Linux): Supports files up to 16 TiB +- **APFS/HFS+** (macOS): Supports very large files + +## Use Cases + +- **Performance Testing**: Benchmark disk I/O speeds +- **Storage Validation**: Verify storage capacity and integrity +- **Transfer Testing**: Test file transfer mechanisms and speeds +- 
**Application Testing**: Test applications with large file handling +- **Disk Burn-in**: Stress test new storage devices + +## Output Examples + +### Creating a file: +``` +Creating file: test.bin +Target size: 15.00 GiB +Chunk size: 64.00 MiB + +Progress: 5% (768.00 MiB written) +Written: 1.50 GiB, Speed: 1.23 GiB/s +Progress: 10% (1.50 GiB written) +... +✓ Successfully created test.bin (15.00 GiB) +Time taken: 12.34 seconds +Average speed: 1.22 GiB/s +``` + +### Reading a file: +``` +Reading file: test.bin +File size: 15.00 GiB +Chunk size: 64.00 MiB + +Progress: 5% (768.00 MiB read) +Read: 1.50 GiB, Speed: 1.45 GiB/s +Progress: 10% (1.50 GiB read) +... +✓ Successfully read 15.00 GiB +Time taken: 10.12 seconds +Average speed: 1.48 GiB/s +SHA256: a3d5c... (if --hash was used) +``` + +## Error Handling + +Both tools include comprehensive error handling: +- File existence checks +- Disk space validation +- Permission verification +- Interrupt handling (Ctrl+C) +- Automatic cleanup on errors + +## Exit Codes + +- `0` - Success +- `1` - General error (file not found, permission denied, etc.) +- `130` - Interrupted by user (Ctrl+C) + +## License + +MIT License - Feel free to use and modify as needed. + +## Contributing + +Contributions welcome! Feel free to submit issues or pull requests. + +## Author + +Created for testing and benchmarking large file operations. 
#!/usr/bin/env python3
"""Generate large binary files for testing purposes."""

import argparse
import os
import shutil
import sys
import time


def parse_size(size_str):
    """Parse a size string like '15GB', '1.5TB', '500MB' into bytes.

    Decimal-prefixed names (KB, MB, ...) are treated as binary multiples,
    matching this tool's documented behavior.  A bare integer string is
    interpreted as raw bytes.

    Raises:
        ValueError: if the string is not a number followed by a known unit.
    """
    size_str = size_str.upper().strip()
    units = {
        'B': 1,
        'KB': 1024,
        'MB': 1024**2,
        'GB': 1024**3,
        'TB': 1024**4,
        'KIB': 1024,
        'MIB': 1024**2,
        'GIB': 1024**3,
        'TIB': 1024**4,
    }

    # BUG FIX: the original iterated the dict in insertion order, so every
    # suffixed size matched 'B' first ("15GB".endswith("B") is True) and
    # float("15G") raised, rejecting ALL unit-suffixed sizes — including the
    # default --chunk-size of 64MB.  Match the longest suffixes first so
    # 'GIB' wins over 'B' and 'GB' over 'B'.
    for unit in sorted(units, key=len, reverse=True):
        if size_str.endswith(unit):
            try:
                value = float(size_str[:-len(unit)])
            except ValueError:
                raise ValueError(f"Invalid size format: {size_str}")
            return int(value * units[unit])

    # No recognized suffix: try parsing as raw bytes.
    try:
        return int(size_str)
    except ValueError:
        raise ValueError(f"Invalid size format: {size_str}. Use format like '15GB', '1.5TB', '500MB'")


def format_bytes(bytes_val):
    """Format a byte count as a human-readable string in binary units."""
    for unit in ['B', 'KiB', 'MiB', 'GiB', 'TiB']:
        if bytes_val < 1024.0:
            return f"{bytes_val:.2f} {unit}"
        bytes_val /= 1024.0
    return f"{bytes_val:.2f} PiB"


def create_file(output_path, total_bytes, chunk_size, quiet=False):
    """Create a file of ``total_bytes`` zeros, writing ``chunk_size`` at a time.

    Refuses to overwrite an existing file, checks free disk space up front,
    reports progress unless ``quiet``, and removes the partial file on
    interrupt or write error.

    Returns:
        Process exit code: 0 on success, 1 on error, 130 on Ctrl+C.
    """
    # Never clobber an existing file.
    if os.path.exists(output_path):
        print(f"Error: File '{output_path}' already exists", file=sys.stderr)
        return 1

    # The target directory must already exist.
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.exists(output_dir):
        print(f"Error: Directory '{output_dir}' does not exist", file=sys.stderr)
        return 1

    # Best-effort free-space check; shutil.disk_usage is portable across
    # Unix and Windows, replacing the original statvfs/ctypes split.
    # A failure to *check* only warns — the write itself will surface errors.
    try:
        free_space = shutil.disk_usage(output_dir or '.').free
        if free_space < total_bytes:
            print(f"Error: Insufficient disk space. Required: {format_bytes(total_bytes)}, Available: {format_bytes(free_space)}", file=sys.stderr)
            return 1
    except Exception as e:
        print(f"Warning: Could not check disk space: {e}", file=sys.stderr)

    # One reusable zero-filled buffer; avoids reallocating per write.
    chunk = b"\x00" * chunk_size

    if not quiet:
        print(f"Creating file: {output_path}")
        print(f"Target size: {format_bytes(total_bytes)}")
        print(f"Chunk size: {format_bytes(chunk_size)}")
        print()

    start_time = time.time()
    last_log_time = start_time
    last_logged_bytes = 0

    try:
        with open(output_path, "wb") as f:
            written = 0
            last_reported_percent = -1

            # Write whole chunks while they fit; the tail is written below.
            while written + chunk_size <= total_bytes:
                f.write(chunk)
                written += chunk_size

                if not quiet:
                    # Report at 5% milestones, once per milestone.
                    percent = int((written / total_bytes) * 100)
                    if percent != last_reported_percent and percent % 5 == 0:
                        print(f"Progress: {percent}% ({format_bytes(written)} written)")
                        last_reported_percent = percent

                    # Roughly once per second, log instantaneous throughput.
                    now = time.time()
                    if now - last_log_time >= 1.0:
                        gb_written = written / (1024**3)
                        gb_per_sec = (written - last_logged_bytes) / (1024**3) / (now - last_log_time)
                        print(f"Written: {gb_written:.2f} GiB, Speed: {gb_per_sec:.2f} GiB/s")
                        last_log_time = now
                        last_logged_bytes = written

            # Write whatever remainder is smaller than one chunk.
            leftover = total_bytes - written
            if leftover:
                f.write(b"\x00" * leftover)
                written += leftover

        elapsed = time.time() - start_time

        if not quiet:
            print()
            print(f"✓ Successfully created {output_path} ({format_bytes(written)})")
            if elapsed > 0:
                print(f"Time taken: {elapsed:.2f} seconds")
                print(f"Average speed: {format_bytes(written / elapsed)}/s")

        return 0

    except KeyboardInterrupt:
        print("\n\nInterrupted by user", file=sys.stderr)
        # Clean up the partial file so the user isn't left with junk.
        if os.path.exists(output_path):
            print(f"Cleaning up partial file: {output_path}", file=sys.stderr)
            try:
                os.remove(output_path)
            except Exception as e:
                print(f"Warning: Could not remove partial file: {e}", file=sys.stderr)
        return 130

    except IOError as e:
        print(f"Error writing file: {e}", file=sys.stderr)
        # Best-effort cleanup of the partial file.
        if os.path.exists(output_path):
            try:
                os.remove(output_path)
            except Exception:
                pass
        return 1


def main():
    """Parse CLI arguments, validate sizes, and create the requested file."""
    parser = argparse.ArgumentParser(
        description='Generate large binary files filled with zeros for testing purposes.',
        epilog='Examples:\n'
               '  %(prog)s output.bin 15GB\n'
               '  %(prog)s test.dat 1.5TB --chunk-size 128MB\n'
               '  %(prog)s small.bin 500MB --quiet',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        'output',
        help='Output file path'
    )
    parser.add_argument(
        'size',
        help='File size (e.g., 15GB, 1.5TB, 500MB, 1073741824)'
    )
    parser.add_argument(
        '--chunk-size',
        default='64MB',
        help='Chunk size for writing (default: 64MB)'
    )
    parser.add_argument(
        '--quiet', '-q',
        action='store_true',
        help='Suppress progress output'
    )
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s 1.0.0'
    )

    args = parser.parse_args()

    try:
        total_bytes = parse_size(args.size)
        chunk_size = parse_size(args.chunk_size)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    if total_bytes <= 0:
        print("Error: Size must be greater than 0", file=sys.stderr)
        return 1

    if chunk_size <= 0 or chunk_size > 1024**3:  # Cap chunk buffer at 1 GiB of RAM
        print("Error: Chunk size must be between 1 byte and 1GB", file=sys.stderr)
        return 1

    return create_file(args.output, total_bytes, chunk_size, args.quiet)


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/env python3
"""Read and benchmark large file I/O performance."""

import argparse
import hashlib
import os
import sys
import time


def parse_size(size_str):
    """Parse a size string like '64MB', '128KB' into bytes.

    Decimal-prefixed names (KB, MB, ...) are treated as binary multiples,
    matching this tool's documented behavior.  A bare integer string is
    interpreted as raw bytes.

    Raises:
        ValueError: if the string is not a number followed by a known unit.
    """
    size_str = size_str.upper().strip()
    units = {
        'B': 1,
        'KB': 1024,
        'MB': 1024**2,
        'GB': 1024**3,
        'TB': 1024**4,
        'KIB': 1024,
        'MIB': 1024**2,
        'GIB': 1024**3,
        'TIB': 1024**4,
    }

    # BUG FIX: the original iterated the dict in insertion order, so every
    # suffixed size matched 'B' first ("64MB".endswith("B") is True) and
    # float("64M") raised, rejecting ALL unit-suffixed sizes — including the
    # default --chunk-size of 64MB.  Match the longest suffixes first so
    # 'MIB' wins over 'B' and 'MB' over 'B'.
    for unit in sorted(units, key=len, reverse=True):
        if size_str.endswith(unit):
            try:
                value = float(size_str[:-len(unit)])
            except ValueError:
                raise ValueError(f"Invalid size format: {size_str}")
            return int(value * units[unit])

    # No recognized suffix: try parsing as raw bytes.
    try:
        return int(size_str)
    except ValueError:
        raise ValueError(f"Invalid size format: {size_str}. Use format like '64MB', '128KB'")


def format_bytes(bytes_val):
    """Format a byte count as a human-readable string in binary units."""
    for unit in ['B', 'KiB', 'MiB', 'GiB', 'TiB']:
        if bytes_val < 1024.0:
            return f"{bytes_val:.2f} {unit}"
        bytes_val /= 1024.0
    return f"{bytes_val:.2f} PiB"


def read_file(input_path, chunk_size, compute_hash=False, quiet=False):
    """Read ``input_path`` in ``chunk_size`` chunks, reporting throughput.

    Optionally computes a SHA256 digest over the file contents; the digest
    is printed even in quiet mode (quiet suppresses only progress output,
    per the documented `--hash --quiet` usage).

    Returns:
        Process exit code: 0 on success, 1 on error, 130 on Ctrl+C.
    """
    if not os.path.exists(input_path):
        print(f"Error: File '{input_path}' does not exist", file=sys.stderr)
        return 1

    if not os.path.isfile(input_path):
        print(f"Error: '{input_path}' is not a file", file=sys.stderr)
        return 1

    try:
        total_bytes = os.path.getsize(input_path)
    except OSError as e:
        print(f"Error: Cannot get file size: {e}", file=sys.stderr)
        return 1

    # An empty file is not an error — there is simply nothing to read.
    if total_bytes == 0:
        print("Warning: File is empty", file=sys.stderr)
        return 0

    if not quiet:
        print(f"Reading file: {input_path}")
        print(f"File size: {format_bytes(total_bytes)}")
        print(f"Chunk size: {format_bytes(chunk_size)}")
        if compute_hash:
            print("Computing: SHA256 hash")
        print()

    start_time = time.time()
    last_log_time = start_time
    last_logged_bytes = 0

    hash_obj = hashlib.sha256() if compute_hash else None

    try:
        with open(input_path, "rb") as f:
            bytes_read = 0
            last_reported_percent = -1

            while True:
                chunk = f.read(chunk_size)
                if not chunk:
                    break

                bytes_read += len(chunk)

                if compute_hash:
                    hash_obj.update(chunk)

                if not quiet:
                    # Report at 5% milestones, once per milestone.
                    percent = int((bytes_read / total_bytes) * 100)
                    if percent != last_reported_percent and percent % 5 == 0:
                        print(f"Progress: {percent}% ({format_bytes(bytes_read)} read)")
                        last_reported_percent = percent

                    # Roughly once per second, log instantaneous throughput.
                    now = time.time()
                    if now - last_log_time >= 1.0:
                        gb_read = bytes_read / (1024**3)
                        gb_per_sec = (bytes_read - last_logged_bytes) / (1024**3) / (now - last_log_time)
                        print(f"Read: {gb_read:.2f} GiB, Speed: {gb_per_sec:.2f} GiB/s")
                        last_log_time = now
                        last_logged_bytes = bytes_read

        elapsed = time.time() - start_time

        if not quiet:
            print()
            print(f"✓ Successfully read {format_bytes(bytes_read)}")
            if elapsed > 0:
                print(f"Time taken: {elapsed:.2f} seconds")
                print(f"Average speed: {format_bytes(bytes_read / elapsed)}/s")

        if compute_hash:
            print(f"SHA256: {hash_obj.hexdigest()}")

        return 0

    except KeyboardInterrupt:
        print("\n\nInterrupted by user", file=sys.stderr)
        return 130

    except IOError as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        return 1


def main():
    """Parse CLI arguments, validate the chunk size, and read the file."""
    parser = argparse.ArgumentParser(
        description='Read and benchmark large file I/O performance.',
        epilog='Examples:\n'
               '  %(prog)s largefile.bin\n'
               '  %(prog)s test.dat --chunk-size 128MB\n'
               '  %(prog)s data.bin --hash --quiet',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument(
        'input',
        help='Input file path to read'
    )
    parser.add_argument(
        '--chunk-size',
        default='64MB',
        help='Chunk size for reading (default: 64MB)'
    )
    parser.add_argument(
        '--hash',
        action='store_true',
        help='Compute SHA256 hash of the file'
    )
    parser.add_argument(
        '--quiet', '-q',
        action='store_true',
        help='Suppress progress output'
    )
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s 1.0.0'
    )

    args = parser.parse_args()

    try:
        chunk_size = parse_size(args.chunk_size)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    if chunk_size <= 0 or chunk_size > 1024**3:  # Cap chunk buffer at 1 GiB of RAM
        print("Error: Chunk size must be between 1 byte and 1GB", file=sys.stderr)
        return 1

    return read_file(args.input, chunk_size, args.hash, args.quiet)


if __name__ == "__main__":
    sys.exit(main())