Skip to content

analysis Module

The analysis module provides high-level tools for evaluating structural quality, comparing protein models, and analyzing conformational ensembles.

Overview

After generating synthetic structures or ensembles, it is often necessary to quantify their precision and accuracy. This module provides a suite of geometric analyzers that handle alignment, RMSD calculation, and residue-level strain analysis.

Key Features

  • Structure Comparison: Optimal superposition of PDB models using the Kabsch algorithm.
  • Ensemble Analysis: Identification of the "medoid" structure (most representative) and calculation of ensemble-averaged RMSD.
  • Geometric Strain: Identification of localized structural distortions, such as non-trans peptide bonds.

API Reference

analysis

Classes

GeometryAnalyzer

High-level analysis suite for protein geometry and ensembles.

Source code in synth_pdb/analysis.py
class GeometryAnalyzer:
    """High-level analysis suite for protein geometry and ensembles."""

    @staticmethod
    def compare_pdbs(pdb_path1: str, pdb_path2: str, ca_only: bool = True) -> dict[str, Any]:
        """Compare two PDB files and calculate RMSD and optimal transformation.

        Args:
            pdb_path1: Path to the first PDB file (mobile).
            pdb_path2: Path to the second PDB file (reference).
            ca_only: If True, only uses C-alpha atoms for alignment and RMSD.

        Returns:
            Dictionary containing 'rmsd', 'rotation', and 'translation'.
        """
        s1 = pdb.PDBFile.read(pdb_path1).get_structure(model=1)
        s2 = pdb.PDBFile.read(pdb_path2).get_structure(model=1)

        if ca_only:
            m1 = s1[s1.atom_name == "CA"]
            m2 = s2[s2.atom_name == "CA"]
        else:
            # Common heavy atoms
            m1 = s1[struc.filter_heavy(s1)]
            m2 = s2[struc.filter_heavy(s2)]

        if len(m1) != len(m2):
            raise ValueError(f"Atom count mismatch: {len(m1)} vs {len(m2)}")

        r1 = m1.coord
        r2 = m2.coord

        rot, trans = kabsch_superposition(r1, r2)
        fitted = (rot @ r1.T).T + trans
        rmsd = calculate_rmsd(fitted, r2)

        return {
            "rmsd": rmsd,
            "rotation": rot,
            "translation": trans,
        }

    @staticmethod
    def analyze_ensemble_pdbs(pdb_paths: list[str]) -> dict[str, Any]:
        """Analyze a list of PDB files as an NMR-style ensemble.

        Args:
            pdb_paths: List of file paths to PDB files.

        Returns:
            Dictionary with 'avg_rmsd', 'medoid_path', and 'medoid_index'.
        """
        coords_list = []
        for path in pdb_paths:
            s = pdb.PDBFile.read(path).get_structure(model=1)
            ca = s[s.atom_name == "CA"]
            coords_list.append(ca.coord)

        avg_rmsd, _ = calculate_rmsd_to_average(coords_list)
        medoid_idx = find_medoid(coords_list, superimpose=True)

        return {
            "avg_rmsd": avg_rmsd,
            "medoid_index": medoid_idx,
            "medoid_path": pdb_paths[medoid_idx],
        }

    @staticmethod
    def calculate_residue_strain(pdb_path: str) -> dict[int, float]:
        """Calculates 'geometric strain' per residue.

        Currently defined as the deviation of the peptide bond omega angle
        from trans (180 deg).
        """
        s = pdb.PDBFile.read(pdb_path).get_structure(model=1)
        res_ids = np.unique(s.res_id)
        strain = {}

        for i in range(len(res_ids) - 1):
            rid1 = res_ids[i]
            rid2 = res_ids[i + 1]

            # Omega: CA(i) - C(i) - N(i+1) - CA(i+1)
            ca1 = s[(s.res_id == rid1) & (s.atom_name == "CA")]
            c1 = s[(s.res_id == rid1) & (s.atom_name == "C")]
            n2 = s[(s.res_id == rid2) & (s.atom_name == "N")]
            ca2 = s[(s.res_id == rid2) & (s.atom_name == "CA")]

            if all(len(x) > 0 for x in [ca1, c1, n2, ca2]):
                omega = calculate_dihedral(ca1[0].coord, c1[0].coord, n2[0].coord, ca2[0].coord)
                # Map to [0, 180] deviation from trans (180 deg)
                # (omega - 180 + 180) % 360 - 180 gives signed distance to 180
                diff = (omega - 180 + 180) % 360 - 180
                deviation = abs(diff)
                strain[int(rid1)] = deviation

        return strain
Functions
compare_pdbs(pdb_path1, pdb_path2, ca_only=True) staticmethod

Compare two PDB files and calculate RMSD and optimal transformation.

Parameters:

Name Type Description Default
pdb_path1 str

Path to the first PDB file (mobile).

required
pdb_path2 str

Path to the second PDB file (reference).

required
ca_only bool

If True, only uses C-alpha atoms for alignment and RMSD.

True

Returns:

Type Description
dict[str, Any]

Dictionary containing 'rmsd', 'rotation', and 'translation'.

Source code in synth_pdb/analysis.py
@staticmethod
def compare_pdbs(pdb_path1: str, pdb_path2: str, ca_only: bool = True) -> dict[str, Any]:
    """Compare two PDB files and calculate RMSD and optimal transformation.

    Args:
        pdb_path1: Path to the first PDB file (mobile).
        pdb_path2: Path to the second PDB file (reference).
        ca_only: If True, only uses C-alpha atoms for alignment and RMSD.

    Returns:
        Dictionary containing 'rmsd', 'rotation', and 'translation'.
    """
    s1 = pdb.PDBFile.read(pdb_path1).get_structure(model=1)
    s2 = pdb.PDBFile.read(pdb_path2).get_structure(model=1)

    if ca_only:
        m1 = s1[s1.atom_name == "CA"]
        m2 = s2[s2.atom_name == "CA"]
    else:
        # Common heavy atoms
        m1 = s1[struc.filter_heavy(s1)]
        m2 = s2[struc.filter_heavy(s2)]

    if len(m1) != len(m2):
        raise ValueError(f"Atom count mismatch: {len(m1)} vs {len(m2)}")

    r1 = m1.coord
    r2 = m2.coord

    rot, trans = kabsch_superposition(r1, r2)
    fitted = (rot @ r1.T).T + trans
    rmsd = calculate_rmsd(fitted, r2)

    return {
        "rmsd": rmsd,
        "rotation": rot,
        "translation": trans,
    }
analyze_ensemble_pdbs(pdb_paths) staticmethod

Analyze a list of PDB files as an NMR-style ensemble.

Parameters:

Name Type Description Default
pdb_paths list[str]

List of file paths to PDB files.

required

Returns:

Type Description
dict[str, Any]

Dictionary with 'avg_rmsd', 'medoid_path', and 'medoid_index'.

Source code in synth_pdb/analysis.py
@staticmethod
def analyze_ensemble_pdbs(pdb_paths: list[str]) -> dict[str, Any]:
    """Analyze a list of PDB files as an NMR-style ensemble.

    Args:
        pdb_paths: List of file paths to PDB files.

    Returns:
        Dictionary with 'avg_rmsd', 'medoid_path', and 'medoid_index'.
    """
    coords_list = []
    for path in pdb_paths:
        s = pdb.PDBFile.read(path).get_structure(model=1)
        ca = s[s.atom_name == "CA"]
        coords_list.append(ca.coord)

    avg_rmsd, _ = calculate_rmsd_to_average(coords_list)
    medoid_idx = find_medoid(coords_list, superimpose=True)

    return {
        "avg_rmsd": avg_rmsd,
        "medoid_index": medoid_idx,
        "medoid_path": pdb_paths[medoid_idx],
    }
calculate_residue_strain(pdb_path) staticmethod

Calculates 'geometric strain' per residue.

Currently defined as the deviation of the peptide bond omega angle from trans (180 deg).

Source code in synth_pdb/analysis.py
@staticmethod
def calculate_residue_strain(pdb_path: str) -> dict[int, float]:
    """Calculates 'geometric strain' per residue.

    Currently defined as the deviation of the peptide bond omega angle
    from trans (180 deg).
    """
    s = pdb.PDBFile.read(pdb_path).get_structure(model=1)
    res_ids = np.unique(s.res_id)
    strain = {}

    for i in range(len(res_ids) - 1):
        rid1 = res_ids[i]
        rid2 = res_ids[i + 1]

        # Omega: CA(i) - C(i) - N(i+1) - CA(i+1)
        ca1 = s[(s.res_id == rid1) & (s.atom_name == "CA")]
        c1 = s[(s.res_id == rid1) & (s.atom_name == "C")]
        n2 = s[(s.res_id == rid2) & (s.atom_name == "N")]
        ca2 = s[(s.res_id == rid2) & (s.atom_name == "CA")]

        if all(len(x) > 0 for x in [ca1, c1, n2, ca2]):
            omega = calculate_dihedral(ca1[0].coord, c1[0].coord, n2[0].coord, ca2[0].coord)
            # Map to [0, 180] deviation from trans (180 deg)
            # (omega - 180 + 180) % 360 - 180 gives signed distance to 180
            diff = (omega - 180 + 180) % 360 - 180
            deviation = abs(diff)
            strain[int(rid1)] = deviation

    return strain

Scientific Principles

Kabsch Superposition

To compare two structures, we must find the rotation matrix \(R\) and translation vector \(\mathbf{t}\) that minimize the Root Mean Square Deviation (RMSD):

\[RMSD = \sqrt{\frac{1}{N} \sum_{i=1}^N \| R\mathbf{x}_i + \mathbf{t} - \mathbf{y}_i \|^2}\]

The GeometryAnalyzer uses the Kabsch algorithm (via SVD) to find the optimal \(R\) that aligns the mobile structure \(\mathbf{x}\) to the reference structure \(\mathbf{y}\).

Ensemble Medoid

For an ensemble of NMR structures or MD frames, the medoid is the structure \(k\) that has the minimum average RMSD to all other structures in the set:

\[\text{Medoid} = \arg\min_k \frac{1}{N} \sum_{j=1}^N RMSD(k, j)\]

This is a more robust representative of the ensemble than a simple average structure, which may have physically impossible bond lengths or angles.

Usage Example

from synth_pdb.analysis import GeometryAnalyzer

# 1. Compare two structures (e.g., predicted vs. ground truth)
results = GeometryAnalyzer.compare_pdbs(
    "model.pdb", 
    "reference.pdb", 
    ca_only=True
)
print(f"RMSD: {results['rmsd']:.2f} Å")

# 2. Analyze an NMR ensemble
ensemble_files = ["frame1.pdb", "frame2.pdb", "frame3.pdb"]
stats = GeometryAnalyzer.analyze_ensemble_pdbs(ensemble_files)
print(f"Ensemble Precision: {stats['avg_rmsd']:.2f} Å")
print(f"Most representative model: {stats['medoid_path']}")

# 3. Check for geometric strain
strain = GeometryAnalyzer.calculate_residue_strain("model.pdb")
for res_id, dev in strain.items():
    if dev > 20: # Large deviation from trans
        print(f"Warning: High omega strain at residue {res_id}: {dev:.1f}°")