This notebook now focuses on the visualization and simulation-preparation steps.
Sequence and UniProt/PDB exploration moved to docs/episodes/notebooks/01-introduction-sequence-check.ipynb.
Step 1: Interactive visualization (requires nglview)¶
In [ ]:
# Interactive visualization (requires nglview)
import os
from pathlib import Path
COURSE_DIR = Path(os.environ.get("COURSE_DIR", str(Path.home() / "Concepcion26"))).expanduser()
PROTEIN_PDB = COURSE_DIR / "data" / "complex" / "protein.pdb"
LIGAND_SDF = COURSE_DIR / "data" / "complex" / "ligand1.sdf"
try:
import nglview as nv
except ImportError as exc:
raise SystemExit(
"nglview is required for visualization. Install with:"
" conda install -c conda-forge nglview"
) from exc
view = nv.show_file(str(PROTEIN_PDB))
try:
view.add_component(str(LIGAND_SDF))
except Exception:
pass
view
NGLWidget()
Step 2: Generate parameter and initial coordinate files in AMBER format¶
import os¶
This cell executes: import os.
In [ ]:
import os
from collections import Counter
from pathlib import Path
import openmm as mm
import openmm.app as app
from openff.toolkit import Molecule
from openmm.app import ForceField, Modeller, PDBFile
from openmmforcefields.generators import SystemGenerator
COURSE_DIR = Path(os.environ.get("COURSE_DIR", str(Path.home() / "Concepcion26"))).expanduser()
DATA_DIR = COURSE_DIR / "data" / "complex"
PROTEIN_PDB = DATA_DIR / "protein.pdb"
LIGAND_SDF = DATA_DIR / "ligand1.sdf"
OUT_DIR = COURSE_DIR / "results" / "01-introduccion" / "complex"
OUT_DIR.mkdir(parents=True, exist_ok=True)
WATER_NAMES = {"HOH", "WAT", "SOL", "TIP3", "TIP3P"}
def summarize_topology(label, topology):
residues = list(topology.residues())
chains = list(topology.chains())
residue_counts = Counter(res.name for res in residues)
water_count = sum(1 for res in residues if res.name in WATER_NAMES)
print(f"{label}:")
print(" Atoms:", topology.getNumAtoms())
print(" Residues:", len(residues))
print(" Chains:", len(chains))
if water_count:
print(" Water residues:", water_count)
for name, count in residue_counts.most_common(10):
print(f" Residue {name}: {count}")
def resolve_ff19sb():
candidates = [
"amber/ff19SB.xml",
"amber14/ff19SB.xml",
"amber14/protein.ff19SB.xml",
]
for ff in candidates:
try:
ForceField(ff)
except Exception:
continue
print("Using protein force field:", ff)
return ff
data_dir = Path(app.__file__).resolve().parent / "data"
matches = sorted(p for p in data_dir.rglob("*.xml") if "ff19" in p.name.lower())
if matches:
ff_path = str(matches[0])
print("Using protein force field:", ff_path)
return ff_path
raise SystemExit(
"Could not locate ff19SB. Tried: " + ", ".join(candidates) + "\n"
"Install with: conda install -c conda-forge openmmforcefields ambertools"
)
def write_amber_files(topology, system, positions, out_dir, base_name):
try:
import parmed as pmd
except ImportError as exc:
raise SystemExit(
"Parmed is required to write Amber files. Install with:\n"
" conda install -c conda-forge parmed"
) from exc
structure = pmd.openmm.load_topology(topology, system, positions)
prmtop_path = out_dir / f"{base_name}.prmtop"
inpcrd_path = out_dir / f"{base_name}.inpcrd"
structure.save(str(prmtop_path), overwrite=True)
structure.save(str(inpcrd_path), overwrite=True)
print("Written:", prmtop_path)
print("Written:", inpcrd_path)
print("OpenMM", mm.__version__)
print("Protein PDB:", PROTEIN_PDB)
print("Ligand SDF:", LIGAND_SDF)
print("Output dir:", OUT_DIR)
protein = PDBFile(str(PROTEIN_PDB))
ligand = Molecule.from_file(str(LIGAND_SDF))
summarize_topology("Protein (input PDB)", protein.topology)
print("Ligand atoms:", ligand.n_atoms)
protein_ff = resolve_ff19sb()
protein_forcefield = ForceField(protein_ff)
protein_modeller = Modeller(protein.topology, protein.positions)
protein_modeller.addHydrogens(protein_forcefield)
summarize_topology("Protein (with H)", protein_modeller.topology)
ligand_top = ligand.to_topology()
modeller = Modeller(protein_modeller.topology, protein_modeller.positions)
modeller.add(ligand_top.to_openmm(), ligand_top.get_positions().to_openmm())
summarize_topology("Protein + ligand (combined)", modeller.topology)
system_generator = SystemGenerator(
forcefields=[protein_ff],
small_molecule_forcefield="gaff-2.11",
molecules=[ligand],
)
system = system_generator.create_system(modeller.topology, molecules=ligand)
write_amber_files(modeller.topology, system, modeller.positions, OUT_DIR, "protein_ligand")
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[22], line 7 5 import openmm as mm 6 import openmm.app as app ----> 7 from openff.toolkit import Molecule 8 from openmm.app import ForceField, Modeller, PDBFile 9 from openmmforcefields.generators import SystemGenerator ModuleNotFoundError: No module named 'openff'
Step 3: topology and input files for simulation¶
In [ ]:
import os
from collections import Counter
from pathlib import Path
import openmm as mm
import openmm.app as app
from openff.toolkit import Molecule
from openmm.app import ForceField, Modeller, PDBFile
from openmmforcefields.generators import SystemGenerator
COURSE_DIR = Path(os.environ.get("COURSE_DIR", str(Path.home() / "Concepcion26"))).expanduser()
DATA_DIR = COURSE_DIR / "data" / "complex"
PROTEIN_PDB = DATA_DIR / "protein.pdb"
LIGAND_SDF = DATA_DIR / "ligand1.sdf"
OUT_DIR = COURSE_DIR / "results" / "01-introduccion" / "complex"
OUT_DIR.mkdir(parents=True, exist_ok=True)
WATER_NAMES = {"HOH", "WAT", "SOL", "TIP3", "TIP3P"}
def summarize_topology(label, topology):
residues = list(topology.residues())
chains = list(topology.chains())
residue_counts = Counter(res.name for res in residues)
water_count = sum(1 for res in residues if res.name in WATER_NAMES)
print(f"{label}:")
print(" Atoms:", topology.getNumAtoms())
print(" Residues:", len(residues))
print(" Chains:", len(chains))
if water_count:
print(" Water residues:", water_count)
for name, count in residue_counts.most_common(10):
print(f" Residue {name}: {count}")
def resolve_ff19sb():
candidates = [
"amber/ff19SB.xml",
"amber14/ff19SB.xml",
"amber14/protein.ff19SB.xml",
]
for ff in candidates:
try:
ForceField(ff)
except Exception:
continue
print("Using protein force field:", ff)
return ff
data_dir = Path(app.__file__).resolve().parent / "data"
matches = sorted(p for p in data_dir.rglob("*.xml") if "ff19" in p.name.lower())
if matches:
ff_path = str(matches[0])
print("Using protein force field:", ff_path)
return ff_path
raise SystemExit(
"Could not locate ff19SB. Tried: " + ", ".join(candidates) + "\n"
"Install with: conda install -c conda-forge openmmforcefields ambertools"
)
def write_amber_files(topology, system, positions, out_dir, base_name):
try:
import parmed as pmd
except ImportError as exc:
raise SystemExit(
"Parmed is required to write Amber files. Install with:\n"
" conda install -c conda-forge parmed"
) from exc
structure = pmd.openmm.load_topology(topology, system, positions)
prmtop_path = out_dir / f"{base_name}.prmtop"
inpcrd_path = out_dir / f"{base_name}.inpcrd"
structure.save(str(prmtop_path), overwrite=True)
structure.save(str(inpcrd_path), overwrite=True)
print("Written:", prmtop_path)
print("Written:", inpcrd_path)
print("OpenMM", mm.__version__)
print("Protein PDB:", PROTEIN_PDB)
print("Ligand SDF:", LIGAND_SDF)
print("Output dir:", OUT_DIR)
protein = PDBFile(str(PROTEIN_PDB))
ligand = Molecule.from_file(str(LIGAND_SDF))
summarize_topology("Protein (input PDB)", protein.topology)
print("Ligand atoms:", ligand.n_atoms)
protein_ff = resolve_ff19sb()
protein_forcefield = ForceField(protein_ff)
protein_modeller = Modeller(protein.topology, protein.positions)
protein_modeller.addHydrogens(protein_forcefield)
summarize_topology("Protein (with H)", protein_modeller.topology)
ligand_top = ligand.to_topology()
modeller = Modeller(protein_modeller.topology, protein_modeller.positions)
modeller.add(ligand_top.to_openmm(), ligand_top.get_positions().to_openmm())
summarize_topology("Protein + ligand (combined)", modeller.topology)
system_generator = SystemGenerator(
forcefields=[protein_ff],
small_molecule_forcefield="gaff-2.11",
molecules=[ligand],
)
system = system_generator.create_system(modeller.topology, molecules=ligand)
write_amber_files(modeller.topology, system, modeller.positions, OUT_DIR, "protein_ligand")
OpenMM 8.4 Protein PDB: /Users/jordivilla/Concepcion26/data/complex/protein.pdb Ligand SDF: /Users/jordivilla/Concepcion26/data/complex/ligand1.sdf Output dir: /Users/jordivilla/Concepcion26/results/01-introduccion/complex Protein (input PDB): Atoms: 4645 Residues: 304 Chains: 1 Residue LEU: 29 Residue VAL: 27 Residue GLY: 26 Residue THR: 24 Residue ASN: 21 Residue ALA: 17 Residue ASP: 17 Residue SER: 16 Residue PHE: 16 Residue PRO: 13 Ligand atoms: 21 Using protein force field: /opt/miniconda3/envs/md-openmm/lib/python3.10/site-packages/openmm/app/data/amber19/protein.ff19SB.xml Protein (with H): Atoms: 4645 Residues: 304 Chains: 1 Residue LEU: 29 Residue VAL: 27 Residue GLY: 26 Residue THR: 24 Residue ASN: 21 Residue ALA: 17 Residue ASP: 17 Residue SER: 16 Residue PHE: 16 Residue PRO: 13 Protein + ligand (combined): Atoms: 4666 Residues: 305 Chains: 2 Residue LEU: 29 Residue VAL: 27 Residue GLY: 26 Residue THR: 24 Residue ASN: 21 Residue ALA: 17 Residue ASP: 17 Residue SER: 16 Residue PHE: 16 Residue PRO: 13 Written: /Users/jordivilla/Concepcion26/results/01-introduccion/complex/protein_ligand.prmtop Written: /Users/jordivilla/Concepcion26/results/01-introduccion/complex/protein_ligand.inpcrd