Chemical Shifts API

`chemical_shifts`

Classes

`ShiftX2Predictor`

Wrapper for the external ShiftX2 predictor.

Requires the 'shiftx2.py' (or 'shiftx2') executable to be in the system PATH, in the directory specified by SHIFTX2_DIR, or in typical installation locations.

ShiftX2 can be installed via SBGrid: 'sbgrid-cli install shiftx2' It can also be downloaded from here: http://www.shiftx2.ca/download.html

Source code in synth_nmr/chemical_shifts.py

class ShiftX2Predictor:
    """
    Wrapper for the external ShiftX2 predictor.

    Requires the 'shiftx2.py' (or 'shiftx2') executable to be in the system PATH,
    in the directory specified by SHIFTX2_DIR, or in typical installation locations.

    ShiftX2 can be installed via SBGrid: 'sbgrid-cli install shiftx2'
    It can also be downloaded from here: http://www.shiftx2.ca/download.html
    """

    def __init__(self, executable: str = "shiftx2.py"):
        self.executable = self._resolve_path(executable)

    def _resolve_path(self, executable: str) -> str:
        """Resolve the full path to the ShiftX2 executable."""
        from shutil import which

        # 1. Check if the provided name is already found by 'which'
        # (this handles absolute paths, relative paths, and PATH search)
        found = which(executable)
        if found:
            return found

        # 2. Check SHIFTX2_DIR environment variable
        shiftx2_dir = os.environ.get("SHIFTX2_DIR")
        if shiftx2_dir:
            for name in [executable, "shiftx2.py", "shiftx2"]:
                if not os.path.isabs(name):
                    path = os.path.join(shiftx2_dir, name)
                    found = which(path)
                    if found:
                        return found

        # 3. Check typical locations
        home = os.path.expanduser("~")
        typical_locations = [
            os.path.join(home, "shiftx2", "shiftx2.py"),
            "/opt/shiftx2/shiftx2.py",
            "/usr/local/bin/shiftx2.py",
            "/usr/local/bin/shiftx2",
        ]
        for loc in typical_locations:
            found = which(loc)
            if found:
                return found

        return executable

    def is_available(self) -> bool:
        """Check if ShiftX2 executable is available."""
        from shutil import which

        return which(self.executable) is not None

    def predict(self, structure: struc.AtomArray) -> Dict[str, Dict[int, Dict[str, float]]]:
        """
        Run ShiftX2 on a Biotite AtomArray.

        Args:
            structure: Biotite AtomArray.

        Returns:
            Predicted shifts in the standard dictionary format.
        """
        if not self.is_available():
            raise RuntimeError(
                f"ShiftX2 executable '{self.executable}' not found. "
                "Please install it (e.g., via SBGrid), add it to your PATH, or set SHIFTX2_DIR."
            )

        with tempfile.TemporaryDirectory() as tmpdir:
            pdb_path = os.path.join(tmpdir, "input.pdb")
            # Some versions of shiftx2 ignore -o and append .cs to the input file
            expected_out_path = pdb_path + ".cs"

            # 1. Write structure to temporary PDB
            pdb_file = pdb.PDBFile()
            pdb_file.set_structure(structure)
            pdb_file.write(pdb_path)

            # 2. Execute ShiftX2
            # Command: shiftx2.py -i <input.pdb>
            try:
                subprocess.run(
                    [self.executable, "-i", pdb_path], check=True, capture_output=True, text=True
                )
            except subprocess.CalledProcessError as e:
                logger.error(f"ShiftX2 failed: {e.stderr}")
                raise RuntimeError(f"ShiftX2 execution failed: {e.stderr}")

            logger.debug(f"Tmpdir contents: {os.listdir(tmpdir)}")
            logger.debug(f"Expecting out path: {expected_out_path}")

            # 3. Parse ShiftX2 output (Simplified CSV/Tabular parsing)
            return self._parse_output(expected_out_path)

    def _parse_output(self, file_path: str) -> Dict[str, Dict[int, Dict[str, float]]]:
        """
        Parse ShiftX2's default output format.
        Expects a tabular format with columns like: NUM, RES, ATOMNAME, SHIFT
        """
        shifts: Dict[str, Dict[int, Dict[str, float]]] = {
            "A": {}
        }  # Assuming single chain for simplicity

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"ShiftX2 output file not found: {file_path}")

        with open(file_path) as f:
            lines = f.readlines()

        # Skip header lines (usually starts with 'NUM' or similar)
        header_found = False
        for line in lines:
            line = line.strip()
            if not line:
                continue

            if "NUM" in line and "RES" in line:
                header_found = True
                continue
            if not header_found:
                continue

            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 4:
                try:
                    res_id = int(parts[0])
                    # res_name = parts[1]
                    atom_name = parts[2]
                    val = float(parts[3])

                    if res_id not in shifts["A"]:
                        shifts["A"][res_id] = {}
                    shifts["A"][res_id][atom_name] = val
                except ValueError:
                    continue

        return shifts

Functions

`is_available()`

Check if ShiftX2 executable is available.

Source code in synth_nmr/chemical_shifts.py

def is_available(self) -> bool:
    """Check if ShiftX2 executable is available."""
    from shutil import which

    return which(self.executable) is not None

`predict(structure)`

Run ShiftX2 on a Biotite AtomArray.

Parameters:

Name	Type	Description	Default
`structure`	`AtomArray`	Biotite AtomArray.	required

Returns:

Type	Description
`Dict[str, Dict[int, Dict[str, float]]]`	Predicted shifts in the standard dictionary format.

Source code in synth_nmr/chemical_shifts.py

def predict(self, structure: struc.AtomArray) -> Dict[str, Dict[int, Dict[str, float]]]:
    """
    Run ShiftX2 on a Biotite AtomArray.

    Args:
        structure: Biotite AtomArray.

    Returns:
        Predicted shifts in the standard dictionary format.
    """
    if not self.is_available():
        raise RuntimeError(
            f"ShiftX2 executable '{self.executable}' not found. "
            "Please install it (e.g., via SBGrid), add it to your PATH, or set SHIFTX2_DIR."
        )

    with tempfile.TemporaryDirectory() as tmpdir:
        pdb_path = os.path.join(tmpdir, "input.pdb")
        # Some versions of shiftx2 ignore -o and append .cs to the input file
        expected_out_path = pdb_path + ".cs"

        # 1. Write structure to temporary PDB
        pdb_file = pdb.PDBFile()
        pdb_file.set_structure(structure)
        pdb_file.write(pdb_path)

        # 2. Execute ShiftX2
        # Command: shiftx2.py -i <input.pdb>
        try:
            subprocess.run(
                [self.executable, "-i", pdb_path], check=True, capture_output=True, text=True
            )
        except subprocess.CalledProcessError as e:
            logger.error(f"ShiftX2 failed: {e.stderr}")
            raise RuntimeError(f"ShiftX2 execution failed: {e.stderr}")

        logger.debug(f"Tmpdir contents: {os.listdir(tmpdir)}")
        logger.debug(f"Expecting out path: {expected_out_path}")

        # 3. Parse ShiftX2 output (Simplified CSV/Tabular parsing)
        return self._parse_output(expected_out_path)

Functions

`predict_chemical_shifts(structure)`

Predict chemical shifts using the highest-accuracy available model. Attempt to use SHIFTX2 first, as it provides the best baseline accuracy. If SHIFTX2 is not available in the system PATH, or if prediction fails, falls back to the SPARTA+ empirical prediction model.

Note: The NeuralShiftPredictor (available in synth_nmr.neural_shifts) is strictly experimental and serves as an educational example of how Protein Language Models (PLMs) could be applied to NMR prediction. It currently requires extensive retraining with geometric attention mechanisms to match or exceed classical empirical force fields.

Source code in synth_nmr/chemical_shifts.py

def predict_chemical_shifts(structure: struc.AtomArray) -> Dict[str, Dict[int, Dict[str, float]]]:
    """
    Predict chemical shifts using the highest-accuracy available model.
    Attempt to use SHIFTX2 first, as it provides the best baseline accuracy.
    If SHIFTX2 is not available in the system PATH, or if prediction fails,
    falls back to the SPARTA+ empirical prediction model.

    Note: The `NeuralShiftPredictor` (available in `synth_nmr.neural_shifts`) is
    strictly experimental and serves as an educational example of how Protein
    Language Models (PLMs) could be applied to NMR prediction. It currently requires
    extensive retraining with geometric attention mechanisms to match or exceed
    classical empirical force fields.
    """
    shiftx_predictor = ShiftX2Predictor()

    if shiftx_predictor.is_available():
        try:
            shifts = shiftx_predictor.predict(structure)
            if shifts:
                logger.info("Successfully predicted chemical shifts using SHIFTX2.")
                return shifts
            else:
                logger.warning(
                    "SHIFTX2 returned empty predictions. Falling back to empirical SPARTA+ model."
                )
        except Exception as e:
            logger.warning(
                f"SHIFTX2 prediction failed: {e}. Falling back to empirical SPARTA+ model."
            )
    else:
        logger.warning(
            "SHIFTX2 executable not found. Falling back to empirical SPARTA+ model. "
            "To use SHIFTX2, ensure it is in your PATH or set the SHIFTX2_DIR environment variable. "
            "For installation, see http://www.shiftx2.ca/download.html or use SBGrid ('sbgrid-cli install shiftx2')."
        )
    return predict_empirical_shifts(structure)

`predict_empirical_shifts(structure)`

Predict chemical shifts based on secondary structure and ring currents.

This function combines random coil shifts, secondary structure-based offsets (SPARTA+-like), and ring current effects from aromatic residues to predict protein chemical shifts.

EDUCATIONAL NOTE - Prediction Algorithm:

Calculate Backbone Dihedrals (Phi/Psi) for every residue.
Classify Secondary Structure:
Alpha: Phi ~ -60, Psi ~ -45
Beta: Phi ~ -120, Psi ~ 120
Coil: Everything else
Calculate Shift: Shift = Random_Coil + Structure_Offset + Noise

LIMITATIONS: - Ring Current Effects (for protons): While an O(N^2) geometry check was previously a concern, these effects are now included for protons near aromatic rings (Phe, Tyr, Trp, His) using a point-dipole approximation. This is crucial for protons in close proximity to aromatic systems. Carbon atoms are not currently included for ring current effects. - H-Bonding: Hydrogen bonds affect Amide H shifts significantly. We omit this for simplicity. - Sequence History: Real shifts depend on (i-1) and (i+1) neighbor types. We omit this for simplicity.

Parameters:

Name	Type	Description	Default
`structure`	`AtomArray`	A biotite.structure.AtomArray containing the protein. Must not be empty.	required

Returns:

Type	Description
`Dict[str, Dict[int, Dict[str, float]]]`	A nested dictionary of predicted shifts: {chain_id: {res_id: {atom_name: value}}}

Raises:

Type	Description
`TypeError`	If the input is not a biotite.structure.AtomArray.
`ValueError`	If the input structure is empty.

Source code in synth_nmr/chemical_shifts.py

def predict_empirical_shifts(structure: struc.AtomArray) -> Dict[str, Dict[int, Dict[str, float]]]:
    """
    Predict chemical shifts based on secondary structure and ring currents.

    This function combines random coil shifts, secondary structure-based offsets (SPARTA+-like),
    and ring current effects from aromatic residues to predict protein chemical shifts.

    EDUCATIONAL NOTE - Prediction Algorithm:
    ========================================
    1. Calculate Backbone Dihedrals (Phi/Psi) for every residue.
    2. Classify Secondary Structure:
       - Alpha: Phi ~ -60, Psi ~ -45
       - Beta:  Phi ~ -120, Psi ~ 120
       - Coil:  Everything else
    3. Calculate Shift:
       Shift = Random_Coil + Structure_Offset + Noise

    LIMITATIONS:
    - Ring Current Effects (for protons): While an O(N^2) geometry check was previously
      a concern, these effects are now included for protons near aromatic rings
      (Phe, Tyr, Trp, His) using a point-dipole approximation. This is crucial for
      protons in close proximity to aromatic systems. Carbon atoms are not currently
      included for ring current effects.
    - H-Bonding: Hydrogen bonds affect Amide H shifts significantly. We omit this for simplicity.
    - Sequence History: Real shifts depend on (i-1) and (i+1) neighbor types. We omit this for simplicity.

    Args:
        structure: A biotite.structure.AtomArray containing the protein. Must not be empty.

    Returns:
        A nested dictionary of predicted shifts: {chain_id: {res_id: {atom_name: value}}}

    Raises:
        TypeError: If the input is not a biotite.structure.AtomArray.
        ValueError: If the input structure is empty.
    """
    logger.info("Predicting chemical shifts (SPARTA+ model with ring currents)...")

    # 1. Input Validation
    if not isinstance(structure, struc.AtomArray):
        raise TypeError("Input 'structure' must be a biotite.structure.AtomArray.")
    if structure.array_length() == 0:
        logger.warning("Input 'structure' is empty. Cannot predict chemical shifts.")
        return {}

    # Filter for amino acids to avoid mismatches with HETATMs (waters/ions)
    protein_mask = struc.filter_amino_acids(structure)
    structure = structure[protein_mask]
    if structure.array_length() == 0:
        return {}

    try:
        # 2. Get Secondary Structure and Aromatic Ring Info
        ss_list = get_secondary_structure(structure)
        rings = _get_aromatic_rings(structure)
        if rings.size > 0:
            logger.debug(f"Found {rings.shape[0]} aromatic rings for ring current calculation.")

        # 3. Iterate through residues and calculate shifts
        results: Dict[str, Dict[int, Dict[str, float]]] = {}
        for i, start_idx in enumerate(struc.get_residue_starts(structure)):
            end_idx = (
                struc.get_residue_starts(structure)[i + 1]
                if i + 1 < len(struc.get_residue_starts(structure))
                else None
            )
            res_atoms = structure[start_idx:end_idx]

            res_id = res_atoms.res_id[0]
            res_name = res_atoms.res_name[0]
            chain_id = res_atoms.chain_id[0]

            rc_shifts = _get_random_coil_shifts(res_name)
            if not rc_shifts:
                logger.debug(f"Skipping non-standard residue: {res_name} {res_id}")
                continue

            ss_state = ss_list[i] if i < len(ss_list) else "coil"
            logger.debug(f"Processing ResID {res_id} ({res_name}), SS: {ss_state}")

            atom_shifts = {}

            for atom_type, base_val in rc_shifts.items():
                if base_val == 0:  # Skip atoms with no defined random coil shift (e.g., Proline H)
                    continue

                # Add secondary structure offset to the random coil baseline
                val = _apply_secondary_structure_offsets(atom_type, ss_state, base_val)

                # Add ring current shift for protons
                if rings.size > 0 and atom_type.startswith("H"):
                    try:
                        target_atom = res_atoms[res_atoms.atom_name == atom_type][0]
                        rc_shift = _calculate_ring_current_shift(target_atom.coord, rings)
                        val += rc_shift
                    except IndexError:
                        # Atom not found in this specific residue, skip ring current
                        logger.debug(
                            f"Atom {atom_type} not found in residue {res_id} for ring current calculation."
                        )
                        pass

                # Add small random noise for "realism" (0.1 - 0.3 ppm)
                # Experimental assignments always have error/variation
                noise = np.random.normal(0, _NOISE_SCALE) if base_val != 0 else 0
                val += noise

                atom_shifts[atom_type] = round(val, 3)

            if chain_id not in results:
                results[chain_id] = {}
            results[chain_id][res_id] = atom_shifts

        logger.info(
            f"Successfully predicted chemical shifts for {struc.get_residue_count(structure)} residues."
        )
        return results

    except Exception as e:
        logger.error(
            f"An unexpected error occurred during chemical shift prediction: {e}", exc_info=True
        )
        raise

`calculate_csi(shifts, structure)`

Calculate the Chemical Shift Index (CSI) for C-alpha atoms.

The CSI is the deviation of an observed chemical shift from its random coil value. It is a reliable indicator of secondary structure. - Positive Delta(CA) > 0.7 ppm suggests a Helical conformation. - Negative Delta(CA) < -0.7 ppm suggests a Sheet conformation.

Parameters:

Name	Type	Description	Default
`shifts`	`Dict[str, Dict[int, Dict[str, float]]]`	A dictionary of chemical shifts, as produced by `predict_chemical_shifts`.	required
`structure`	`AtomArray`	A biotite.structure.AtomArray, required for mapping residue IDs to names.	required

Returns:

Type	Description
`Dict[str, Dict[int, float]]`	A dictionary containing the C-alpha CSI for each residue: {chain_id: {res_id: delta_ppm}}

Raises:

Type	Description
`TypeError`	If inputs are not of the correct type.
`ValueError`	If inputs are empty.

Source code in synth_nmr/chemical_shifts.py

def calculate_csi(
    shifts: Dict[str, Dict[int, Dict[str, float]]], structure: struc.AtomArray
) -> Dict[str, Dict[int, float]]:
    """
    Calculate the Chemical Shift Index (CSI) for C-alpha atoms.

    The CSI is the deviation of an observed chemical shift from its random coil value.
    It is a reliable indicator of secondary structure.
    - Positive Delta(CA) > 0.7 ppm suggests a Helical conformation.
    - Negative Delta(CA) < -0.7 ppm suggests a Sheet conformation.

    Args:
        shifts: A dictionary of chemical shifts, as produced by `predict_chemical_shifts`.
        structure: A biotite.structure.AtomArray, required for mapping residue IDs to names.

    Returns:
        A dictionary containing the C-alpha CSI for each residue: {chain_id: {res_id: delta_ppm}}

    Raises:
        TypeError: If inputs are not of the correct type.
        ValueError: If inputs are empty.
    """
    logger.info("Calculating Chemical Shift Index (CSI) for C-alpha atoms...")

    # 1. Input Validation
    if not isinstance(shifts, dict):
        raise TypeError("Input 'shifts' must be a dictionary.")
    if not shifts:
        logger.warning("Input 'shifts' dictionary is empty. Returning no CSI data.")
        return {}
    if not isinstance(structure, struc.AtomArray):
        raise TypeError("Input 'structure' must be a biotite.structure.AtomArray.")
    if structure.array_length() == 0:
        logger.warning("Input 'structure' is empty. Cannot calculate CSI.")
        return {}

    try:
        # 2. Create a mapping from residue ID to residue name for quick lookup
        res_names = {}
        for idx in struc.get_residue_starts(structure):
            res = structure[idx]
            res_names[res.res_id] = res.res_name
        if not res_names:
            logger.warning("Could not create a residue map from the provided structure.")
            return {}

        # 3. Calculate CSI
        csi_data: Dict[str, Dict[int, float]] = {}
        for chain_id, chain_shifts in shifts.items():
            if not isinstance(chain_shifts, dict):
                continue
            csi_data[chain_id] = {}

            for res_id, atom_shifts in chain_shifts.items():
                if not isinstance(atom_shifts, dict):
                    continue

                res_name = res_names.get(res_id)
                if not res_name:
                    logger.debug(
                        f"Residue ID {res_id} from shifts not found in structure. Skipping."
                    )
                    continue

                if "CA" in atom_shifts and res_name in RANDOM_COIL_SHIFTS:
                    measured = atom_shifts["CA"]
                    random_coil_val = RANDOM_COIL_SHIFTS[res_name].get("CA")

                    if random_coil_val is not None:
                        delta = measured - random_coil_val
                        csi_data[chain_id][res_id] = round(delta, 3)
                        logger.debug(
                            f"CSI for {res_name} {res_id}: Measured={measured}, RC={random_coil_val}, Delta={delta}"
                        )
                    else:
                        logger.debug(
                            f"No random coil 'CA' value for {res_name}. Skipping CSI calculation for this residue."
                        )

        logger.info("CSI calculation complete.")
        return csi_data

    except Exception as e:
        logger.error(f"An unexpected error occurred during CSI calculation: {e}", exc_info=True)
        raise