Source code for lammpskit.config

"""
LAMMPSKit Configuration Module
==============================

Essential validation functions and constants for robust LAMMPS trajectory analysis.
This module provides infrastructure for input validation, parameter checking, and
standardized constants used across LAMMPSKit analysis workflows.

Architecture Design
-------------------
The configuration module follows a functional approach rather than class-based
configuration objects, prioritizing simplicity and direct validation at point-of-use.
This design reduces coupling while ensuring consistent parameter validation across
all analysis modules.

Key Components
--------------
- Column mapping constants for LAMMPS dump file parsing
- Data type labels for displacement and property analysis
- Input validation functions with domain-specific error messages
- Parameter range checking with physics-aware warnings

Validation Philosophy
---------------------
Validation functions use a "fail-fast" approach with informative error messages
to catch configuration issues early in analysis workflows. Physics-aware warnings
help identify potential coordinate system or unit scale problems common in MD
simulations.

Performance Considerations
--------------------------
Validation overhead is O(1) for most functions, O(n) for file list validation.
Pre-validate parameters once at workflow start rather than per-timestep for
optimal performance in long trajectory analysis.
"""

from typing import List, Optional
import os


# =============================================================================
# LAMMPS DATA STRUCTURE CONSTANTS
# =============================================================================

# Data type labels for displacement analysis and statistical reporting
# Used by plotting functions and test validation to ensure consistent labeling
DISPLACEMENT_DATA_LABELS = [
    "abs total disp",  # Total displacement magnitude (Angstroms)
    "density - mass",  # Mass density (g/cm³)
    "temp (K)",  # Temperature (Kelvin)
    "z disp (A)",  # Vertical displacement component (Angstroms)
    "lateral disp (A)",  # Horizontal displacement magnitude (Angstroms)
    "outward disp vector (A)",  # Radial displacement component (Angstroms)
]

# LAMMPS dump file column mappings for trajectory parsing
# Standard format: id type charge x y z vx vy vz fx fy fz
# Indices correspond to: (id, type, charge, x, y, z, vx, vy, vz, fx, fy, fz)
DEFAULT_COLUMNS_TO_READ = (0, 1, 2, 3, 4, 5, 9, 10, 11, 12)  # Core analysis columns
EXTENDED_COLUMNS_TO_READ = (0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16)  # Full property set


# =============================================================================
# VALIDATION FUNCTIONS
# =============================================================================


[docs] def validate_filepath(filepath: str, check_existence: bool = True) -> None: """ Validate file path for LAMMPS trajectory and output files. Ensures filepath is a valid string and optionally verifies file existence. Essential for preventing downstream failures in trajectory reading and analysis output generation. Parameters ---------- filepath : str Path to file for validation. Supports both absolute and relative paths. check_existence : bool, optional Whether to verify file exists on disk (default: True). Set False for output file validation. Raises ------ TypeError If filepath is not a string. ValueError If filepath is empty string. FileNotFoundError If file doesn't exist and check_existence=True. Examples -------- Validate input trajectory file: >>> validate_filepath('trajectory.lammpstrj') Validate output path without existence check: >>> validate_filepath('output/analysis.pdf', check_existence=False) """ if not isinstance(filepath, str): raise TypeError("filepath must be a string") if not filepath: raise ValueError("filepath cannot be empty") if check_existence and not os.path.exists(filepath): raise FileNotFoundError(f"File not found: {filepath}")
[docs] def validate_dataindex(dataindex: int, max_index: Optional[int] = None) -> None: """ Validate array index for displacement data and property arrays. Ensures safe array indexing with support for Python negative indexing. Primarily used for accessing DISPLACEMENT_DATA_LABELS and trajectory property arrays. Parameters ---------- dataindex : int Index to validate. Supports negative indexing (e.g., -1 for last element). max_index : int, optional Maximum allowed index. If None, uses DISPLACEMENT_DATA_LABELS length. Raises ------ ValueError If dataindex is not integer or out of valid range. Notes ----- Negative indexing follows Python conventions: -1 = last element, -n = first element for array of length n. Examples -------- Validate index for displacement data: >>> validate_dataindex(2) # Access 'temp (K)' >>> validate_dataindex(-1) # Access last element """ if not isinstance(dataindex, int): raise ValueError("dataindex must be an integer") # Allow negative indexing (common in Python) if max_index is not None: if dataindex >= max_index or dataindex < -max_index: raise ValueError(f"dataindex {dataindex} is out of range for max_index {max_index}") else: # Use the DISPLACEMENT_DATA_LABELS length for validation like the original total_length = len(DISPLACEMENT_DATA_LABELS) if dataindex < -total_length or dataindex >= total_length: raise ValueError(f"dataindex {dataindex} is out of range")
[docs] def validate_file_list(file_list: List[str]) -> None: """ Validate list of trajectory files for batch processing. Ensures all files exist and are accessible before starting computationally expensive analysis workflows. Prevents partial analysis completion due to missing files in multi-trajectory studies. Parameters ---------- file_list : List[str] List of file paths to validate. Commonly used for time series analysis across multiple LAMMPS dump files. Raises ------ ValueError If file_list is not a list/tuple, is empty, or contains non-string elements. FileNotFoundError If any files don't exist. Reports all missing files simultaneously for efficient error handling. Performance Notes ----------------- Complexity: O(n) where n is number of files. For large file lists (>1000), consider validating in chunks. Examples -------- Validate trajectory sequence: >>> files = ['step_0.lammpstrj', 'step_1000.lammpstrj', 'step_2000.lammpstrj'] >>> validate_file_list(files) """ if not isinstance(file_list, (list, tuple)): raise ValueError("file_list must be a list") if not file_list: raise ValueError("file_list cannot be empty") # Collect all missing files to report them together (like original) missing_files = [] for filepath in file_list: if not isinstance(filepath, str): raise ValueError("All file paths must be strings") if not os.path.exists(filepath): missing_files.append(filepath) if missing_files: raise FileNotFoundError(f"The following files were not found: {missing_files}")
[docs] def validate_loop_parameters(loop_start: int, loop_end: int) -> None: """ Validate timestep range parameters for trajectory analysis loops. Ensures valid timestep iteration bounds for LAMMPS trajectory processing. Critical for preventing infinite loops or invalid memory access in temporal analysis functions. Parameters ---------- loop_start : int Starting timestep index (inclusive). Must be non-negative. loop_end : int Ending timestep index (inclusive). Must be >= loop_start. Raises ------ ValueError If parameters are not integers, negative, or loop_start > loop_end. Notes ----- Both indices are inclusive: range [loop_start, loop_end]. For trajectory with N timesteps, valid range is [0, N-1]. Examples -------- Validate analysis range: >>> validate_loop_parameters(0, 1000) # Analyze first 1001 timesteps >>> validate_loop_parameters(500, 1500) # Analyze middle portion """ if not isinstance(loop_start, int) or not isinstance(loop_end, int): raise ValueError("loop_start and loop_end must be integers") if loop_start < 0: raise ValueError("loop_start must be non-negative") if loop_end < 0: raise ValueError("loop_end must be non-negative") if loop_start > loop_end: raise ValueError(f"loop_start ({loop_start}) must be less than or equal to loop_end ({loop_end})")
[docs] def validate_chunks_parameter(nchunks: int, min_chunks: int = 1, max_chunks: int = 1000) -> None: """ Validate spatial binning parameters for density and distribution analysis. Ensures appropriate bin count for spatial discretization in electrochemical cell analysis. Balances statistical significance with computational efficiency. Parameters ---------- nchunks : int Number of spatial bins/chunks for discretization. min_chunks : int, optional Minimum allowed bins (default: 1). Must be positive. max_chunks : int, optional Maximum allowed bins (default: 1000). Prevents excessive memory usage. Raises ------ ValueError If nchunks is not integer or outside [min_chunks, max_chunks] range. Performance Notes ----------------- Memory usage: O(nchunks) per property per timestep. Computation time: O(N * nchunks) where N is atom count. Optimal range: 10-100 chunks for most electrochemical systems. Examples -------- Validate binning for layer analysis: >>> validate_chunks_parameter(50) # 50 z-direction layers >>> validate_chunks_parameter(10, min_chunks=5, max_chunks=20) """ if not isinstance(nchunks, int): raise ValueError("nchunks must be an integer") # Provide more specific error messages like the original if nchunks < min_chunks: raise ValueError(f"nchunks must be at least {min_chunks}") if nchunks > max_chunks: raise ValueError(f"nchunks cannot exceed {max_chunks}")
[docs] def validate_cluster_parameters(z_filament_lower_limit: float, z_filament_upper_limit: float, thickness: float) -> None: """ Validate geometric parameters for filament connectivity analysis. Ensures physically meaningful parameters for OVITO-based cluster analysis in electrochemical cell simulations. Validates filament detection geometry and provides physics-aware warnings for common coordinate system issues. Parameters ---------- z_filament_lower_limit : float Lower z-coordinate bound for filament connectivity (Angstroms). Typically electrode surface position. z_filament_upper_limit : float Upper z-coordinate bound for filament connectivity (Angstroms). Typically opposite electrode surface position. thickness : float Filament thickness parameter for cluster detection (Angstroms). Controls sensitivity of connectivity algorithm. Raises ------ TypeError If parameters are not numeric (int or float). ValueError If z_lower >= z_upper or thickness <= 0. Warns ----- UserWarning If negative z-coordinates detected (potential coordinate system issue). If large z-values detected (potential unit scale issue). Physics Notes ------------- Typical electrochemical cell dimensions: 20-100 Å electrode separation. Filament thickness: 2-10 Å depending on atom size and connectivity criteria. Z-coordinates should span electrode-to-electrode distance. Examples -------- Validate HfTaO cell parameters: >>> validate_cluster_parameters(-10.0, 50.0, 3.5) # 60 Å cell, 3.5 Å thickness >>> validate_cluster_parameters(0.0, 30.0, 2.0) # 30 Å cell, 2.0 Å thickness """ import warnings # Parameter type validation if not isinstance(z_filament_lower_limit, (int, float)): raise TypeError("z_filament_lower_limit must be numeric (int or float)") if not isinstance(z_filament_upper_limit, (int, float)): raise TypeError("z_filament_upper_limit must be numeric (int or float)") if not isinstance(thickness, (int, float)): raise TypeError("thickness must be numeric (int or float)") # Parameter range validation (errors) if z_filament_lower_limit >= z_filament_upper_limit: raise ValueError( f"z_filament_lower_limit ({z_filament_lower_limit}) must be less than z_filament_upper_limit ({z_filament_upper_limit})" ) if thickness <= 0: raise ValueError(f"thickness ({thickness}) must be positive") # Parameter range validation (warnings) if z_filament_lower_limit < 0: warnings.warn( f"z_filament_lower_limit ({z_filament_lower_limit}) is negative, which might indicate coordinate system issues", UserWarning, ) if abs(z_filament_lower_limit) > 1000 or abs(z_filament_upper_limit) > 1000: warnings.warn( f"Large z-coordinate values detected (z_lower={z_filament_lower_limit}, z_upper={z_filament_upper_limit}), which might indicate unit scale issues", UserWarning, )
# Legacy alias for backward compatibility (in case any tests use it) extract_element_label_from_filename = None # Will be imported from data_processing module