Branch:
lyskov/rosetta-ci:main
「revision: №1」
Test:
colab.none.root.ColabDesign
SubTest:
ColabDesign/af/examples/af_relax_design
SubTest files:
「file-system-view」
Daemon:
devel
State:
ColabDesign/af/examples/af_relax_design
Input Notebook: ColabDesign/af/examples/af_relax_design.ipynb
Output Notebook: /root/working_dir/ColabDesign_af_examples_af_relax_design.ipynb
Executing: 0% 0/13 [00:00<?, ?cell/s]Executing notebook with kernel: python3
Executing: 8% 1/13 [00:00<00:10, 1.12cell/s]
Executing: 23% 3/13 [00:09<00:32, 3.27s/cell]
Executing: 23% 3/13 [00:09<00:32, 3.21s/cell]
Traceback (most recent call last):
File "/usr/local/bin/papermill", line 8, in <module>
sys.exit(papermill())
File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/click/decorators.py", line 33, in new_func
return f(get_current_context(), *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/papermill/cli.py", line 235, in papermill
execute_notebook(
File "/usr/local/lib/python3.10/dist-packages/papermill/execute.py", line 131, in execute_notebook
raise_for_execution_errors(nb, output_path)
File "/usr/local/lib/python3.10/dist-packages/papermill/execute.py", line 251, in raise_for_execution_errors
raise error
papermill.exceptions.PapermillExecutionError:
---------------------------------------------------------------------------
Exception encountered at "In [1]":
ModuleNotFoundErrorTraceback (most recent call last)
<ipython-input-1-677ba16fe0a9> in <cell line: 46>()
44 import asyncio
45 import nest_asyncio
---> 46 from pyppeteer import launch
47 import base64
48
ModuleNotFoundError: No module named 'pyppeteer'
[NbConvertApp] Converting notebook /root/working_dir/ColabDesign_af_examples_af_relax_design.ipynb to html
[NbConvertApp] Writing 319482 bytes to /root/working_dir/ColabDesign_af_examples_af_relax_design.html
[NbConvertApp] Converting notebook /root/working_dir/ColabDesign_af_examples_af_relax_design.ipynb to asciidoc
/usr/local/lib/python3.10/dist-packages/nbconvert/utils/pandoc.py:51: RuntimeWarning: You are using an unsupported version of pandoc (2.9.2.1).
Your version must be at least (2.14.2) but less than (4.0.0).
Refer to https://pandoc.org/installing.html.
Continuing with doubts...
check_pandoc_version()
[NbConvertApp] Writing 9878 bytes to /root/working_dir/ColabDesign_af_examples_af_relax_design.asciidoc
----------------------------------------------------------------
An Exception was encountered at `In [1]'.
#af_relax_design (WIP)
*Efficient and scalable de novo protein design using a relaxed sequence
space*
Christopher Josef Frank, Ali Khoshouei, Yosta de Stigter, Dominik
Schiewitz, Shihao Feng, Sergey Ovchinnikov, Hendrik Dietz
doi: https://doi.org/10.1101/2023.02.24.529906
*WARNING* This notebook is in development, we are still working on
adding all the options from the manuscript above.
[#papermill-error-cell]#Execution using papermill encountered an
exception here and stopped:#
+*In[1]:*+
[source, ipython3]
----
#@title setup
import os
# Python dependencies are checked on their own, NOT behind the "params" guard:
# with a cached params/ directory the old guard skipped the pip installs, and
# the later `from pyppeteer import launch` failed with ModuleNotFoundError.
import importlib.util

missing_pkgs = [
    pkg for pkg in ("pyppeteer", "nest_asyncio")
    if importlib.util.find_spec(pkg) is None
]
if missing_pkgs:
    os.system("pip -q install " + " ".join(missing_pkgs))

if importlib.util.find_spec("colabdesign") is None:
    # get code
    os.system("pip -q install git+https://github.com/sokrypton/ColabDesign.git")
    # for debugging
    os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign")

# Packages installed after interpreter start-up may be hidden by the import
# system's directory caches; drop them so the imports that follow succeed.
importlib.invalidate_caches()

if not os.path.isdir("params"):
    # download params
    os.system("mkdir params")
    os.system("apt-get install aria2 -qq")
    os.system("aria2c -q -x 16 https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar")
    os.system("tar -xf alphafold_params_2022-12-06.tar -C params")
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
from colabdesign import mk_afdesign_model, clear_mem
from colabdesign.mpnn import mk_mpnn_model
from IPython.display import HTML
from google.colab import files
import numpy as np
import requests, time
# Download and compile the TMscore program only when no "TMscore" file is
# present in the working directory yet.
if not os.path.isfile("TMscore"):
    for build_step in (
        "wget -qnc https://zhanggroup.org/TM-score/TMscore.cpp",
        "g++ -static -O3 -ffast-math -lm -o TMscore TMscore.cpp",
    ):
        os.system(build_step)
def tmscore(x, y):
    """Run the local ``./TMscore`` executable on two files and parse its report.

    Args:
        x: Path handed to TMscore as its first argument.
        y: Path handed to TMscore as its second argument.

    Returns:
        A dict holding ``"rms"``, ``"tms"`` and ``"gdt"`` for the report lines
        starting with ``RMSD``, ``TM-score`` and ``GDT-TS-score``. A key is
        absent when the corresponding line does not occur in the output.

    Raises:
        OSError: If ``./TMscore`` cannot be executed.
    """
    import subprocess

    # pass to TMscore
    # An argument list (no shell) hands paths containing spaces or shell
    # metacharacters over verbatim, and run() always closes the pipe.
    proc = subprocess.run(
        ["./TMscore", str(x), str(y)],
        stdout=subprocess.PIPE,
        text=True,
        check=False,
    )

    def parse_float(text):
        # "<label>= <value> ..." -> value
        return float(text.split("=")[1].split()[0])

    # parse outputs
    prefix_to_key = (("RMSD", "rms"), ("TM-score", "tms"), ("GDT-TS-score", "gdt"))
    scores = {}
    for line in proc.stdout.splitlines():
        line = line.rstrip()
        for prefix, key in prefix_to_key:
            if line.startswith(prefix):
                scores[key] = parse_float(line)
                break
    return scores
import asyncio
import nest_asyncio
from pyppeteer import launch
import base64
# Apply nest_asyncio to enable nested event loops
nest_asyncio.apply()
async def fetch_blob_content(page, blob_url):
    """Read the blob behind ``blob_url`` inside the browser page and return its bytes.

    The page evaluates a JavaScript snippet that fetches the blob and converts
    it to a data URL via ``FileReader``; the part after the first comma is
    base64-decoded on the Python side.

    Args:
        page: Object exposing an awaitable ``evaluate(js, arg)`` method
            (presumably a pyppeteer page; confirm against the caller).
        blob_url: URL passed to the JavaScript function as ``blobUrl``.

    Returns:
        The decoded content as ``bytes``.
    """
    # The snippet is kept flush-left so the string sent to the page is unchanged.
    js_blob_reader = """
async (blobUrl) => {
const blob = await fetch(blobUrl).then(r => r.blob());
return new Promise((resolve) => {
const reader = new FileReader();
reader.onloadend = () => resolve(reader.result);
reader.readAsDataURL(blob);
});
}
"""
    data_url = await page.evaluate(js_blob_reader, blob_url)
    # "<header>,<base64 payload>": only the payload is needed.
    _header, payload = data_url.split(',', 1)
    return base64.b64decode(payload)
async def extract_pdb_file_download_link_and_content(url):
    """Open ``url`` in a headless browser and fetch the first ESM Atlas blob link.

    The page is loaded until the network is idle, then every ``a.btn.bg-purple``
    anchor is inspected. The first one whose ``href`` contains
    ``blob:https://esmatlas.com/`` is downloaded through ``fetch_blob_content``.

    Args:
        url: Address of the result page to load.

    Returns:
        ``(href, content)`` for the first matching anchor, where ``content`` is
        whatever ``fetch_blob_content`` returns. If no anchor matches, returns
        ``("No PDB file link found.", None)``.
    """
    browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
    try:
        page = await browser.newPage()
        await page.goto(url, {'waitUntil': 'networkidle0'})
        for element in await page.querySelectorAll('a.btn.bg-purple'):
            href = await page.evaluate('(element) => element.getAttribute("href")', element)
            # getAttribute yields null (None here) for anchors without an href;
            # skip those instead of raising TypeError on the `in` test.
            if href and 'blob:https://esmatlas.com/' in href:
                content = await fetch_blob_content(page, href)
                return href, content
        return "No PDB file link found.", None
    finally:
        # Close the browser on every path, including when a call above raises,
        # so no headless browser process is left behind.
        await browser.close()
def esmfold_api(sequence):
    """Fetch the PDB text shown on the ESM Atlas fold-result page for ``sequence``.

    Args:
        sequence: Sequence string placed in the ``sequence`` query parameter.

    Returns:
        The downloaded content decoded as UTF-8, or the literal string
        ``"Failed to retrieve PDB content."`` when nothing was downloaded.
    """
    from urllib.parse import quote

    # Percent-encode the sequence so characters with a meaning in URLs cannot
    # corrupt the query string; plain letter sequences are left unchanged.
    encoded_sequence = quote(sequence, safe="")
    url = f'https://esmatlas.com/resources/fold/result?fasta_header=%3Eunnamed&sequence={encoded_sequence}'
    loop = asyncio.get_event_loop()
    _link, content = loop.run_until_complete(extract_pdb_file_download_link_and_content(url))
    if content:
        return content.decode('utf-8')
    # NOTE: failure is reported in-band as a message string, not an exception;
    # callers that write the result to a .pdb file receive this text instead.
    return "Failed to retrieve PDB content."
import jax
import jax.numpy as jnp
from colabdesign.af.alphafold.common import residue_constants
----
+*Out[1]:*+
----
ModuleNotFoundErrorTraceback (most recent call last)
<ipython-input-1-677ba16fe0a9> in <cell line: 46>()
44 import asyncio
45 import nest_asyncio
---> 46 from pyppeteer import launch
47 import base64
48
ModuleNotFoundError: No module named 'pyppeteer'
----
+*In[ ]:*+
[source, ipython3]
----
#@title # hallucination
#@markdown For a given length, generate/hallucinate a protein sequence that AlphaFold thinks folds into a well-structured protein (high pLDDT, low PAE, many contacts).
# Form parameters for the hallucination cells. LENGTH and COPIES are passed to
# af_model.prep_inputs; MODE selects the "original" or "manuscript" branch in
# the rg loss and in the design schedule; the use_* flags gate the optional
# losses and the ProteinMPNN weight set.
LENGTH = 100 #@param {type:"integer"}
COPIES = 1 #@param ["1", "2", "3", "4", "5", "6", "7", "8"] {type:"raw"}
MODE = "manuscript" #@param ["original", "manuscript"]
use_rg_loss = True #@param {type:"boolean"}
#@markdown ProteinMPNN Settings
use_mpnn_loss = False #@param {type:"boolean"}
use_solubleMPNN = False #@param {type:"boolean"}
#@markdown
def add_rg_loss(self, weight=0.1):
    """Register a radius-of-gyration loss named ``"rg"`` on the model.

    Args:
        self: Design model whose ``_callbacks["model"]["loss"]`` list and
            ``opt["weights"]`` dict are extended.
        weight: Value stored under ``opt["weights"]["rg"]``.
    """

    def loss_fn(inputs, outputs):
        # CA coordinates from the structure module output.
        positions = outputs["structure_module"]["final_atom_positions"]
        ca = positions[:, residue_constants.atom_order["CA"]]
        if self.protocol == "binder":
            # Only the trailing binder residues are scored.
            ca = ca[-self._binder_len:]
        if MODE == "manuscript":
            # Keep every fifth CA.
            ca = ca[::5]
        centered = ca - ca.mean(0)
        # The small constant keeps the square root away from exactly zero.
        rg = jnp.sqrt(jnp.square(centered).sum(-1).mean() + 1e-8)
        if MODE == "original":
            # Score only the amount above a length-dependent threshold.
            rg_th = 2.38 * ca.shape[0] ** 0.365
            rg = jax.nn.elu(rg - rg_th)
        return {"rg": rg}

    self._callbacks["model"]["loss"].append(loss_fn)
    self.opt["weights"]["rg"] = weight
def add_mpnn_loss(self, mpnn=0.1, mpnn_seq=0.0):
    '''
    add mpnn loss
    mpnn = maximize confidence of proteinmpnn
    mpnn_seq = push designed sequence to match proteinmpnn logits

    A ProteinMPNN model is created and stored on ``self._mpnn`` (soluble or
    original weights, chosen by the global ``use_solubleMPNN``). A loss
    callback is appended to ``self._callbacks["model"]["loss"]``, and the two
    weights are written to ``self.opt["weights"]``.
    '''
    self._mpnn = mk_mpnn_model(weights="soluble" if use_solubleMPNN else "original")

    def loss_fn(inputs, outputs, aux, key):
        # get structure
        atom_idx = tuple(residue_constants.atom_order[k] for k in ["N", "CA", "C", "O"])
        I = {"S": inputs["aatype"],
             "residue_idx": inputs["residue_index"],
             "chain_idx": inputs["asym_id"],
             "X": outputs["structure_module"]["final_atom_positions"][:, atom_idx],
             "mask": outputs["structure_module"]["final_atom_mask"][:, 1],
             "lengths": self._lengths,
             "key": key}

        if "offset" in inputs:
            I["offset"] = inputs["offset"]

        # set autoregressive mask
        L = sum(self._lengths)
        if self.protocol == "binder":
            I["ar_mask"] = 1 - np.eye(L)
            # zero the block belonging to the last self._len positions
            I["ar_mask"][-self._len:, -self._len:] = 0
        else:
            I["ar_mask"] = np.zeros((L, L))

        # get logits (first 20 columns only)
        logits = self._mpnn._score(**I)["logits"][:, :20]
        if self.protocol == "binder":
            logits = logits[-self._len:]
        else:
            logits = logits[:self._len]
        aux["mpnn_logits"] = logits

        # compute loss
        # (the softmax previously stored in an unused local `q` was dropped)
        log_q = jax.nn.log_softmax(logits)
        p = inputs["seq"]["hard"]

        losses = {}
        losses["mpnn"] = -log_q.max(-1).mean()
        # stop_gradient: log_q is treated as a constant in this term
        losses["mpnn_seq"] = -(p * jax.lax.stop_gradient(log_q)).sum(-1).mean()
        return losses

    self._callbacks["model"]["loss"].append(loss_fn)
    self.opt["weights"]["mpnn"] = mpnn
    self.opt["weights"]["mpnn_seq"] = mpnn_seq
# clear_mem() is called before a new hallucination-protocol model is built
# and sized from the LENGTH / COPIES form values.
clear_mem()
af_model = mk_afdesign_model(protocol="hallucination")
af_model.prep_inputs(length=LENGTH, copies=COPIES)

# add extra losses
if use_rg_loss:
    add_rg_loss(af_model)
if use_mpnn_loss:
    add_mpnn_loss(af_model)

# Report the resulting chain lengths and loss weights.
print("length", af_model._lengths)
print("weights", af_model.opt["weights"])
----
+*In[ ]:*+
[source, ipython3]
----
# Design schedule: restart() is called on the model, then MODE chooses how the
# sequence is initialised before the design_logits steps.
# NOTE(review): block indentation is not visible in this rendering, so which
# of the calls below belong to each `if` cannot be confirmed from here.
# Presumably the final set_weights / design_logits calls run for both modes;
# verify against the notebook.
af_model.restart()
if MODE == "original":
# pre-design with gumbel initialization and softmax activation
af_model.set_weights(plddt=0.0, pae=0.0)
af_model.set_seq(mode=["gumbel"])
af_model.design_soft(50)
# seed the next stage with the "pseudo" sequence stored in aux
af_model.set_seq(af_model.aux["seq"]["pseudo"])
if MODE == "manuscript":
af_model.set_seq(mode=["gumbel","soft"])
af_model.set_weights(plddt=1.0, pae=1.0)
af_model.design_logits(40)
# last 10 steps are run with save_best=True
af_model.design_logits(10, save_best=True)
----
+*In[ ]:*+
[source, ipython3]
----
# Write the design to "<protocol>.pdb", the file the ProteinMPNN and TMscore
# cells read back, then call plot_pdb() on the model.
af_model.save_pdb(f"{af_model.protocol}.pdb")
af_model.plot_pdb()
----
+*In[ ]:*+
[source, ipython3]
----
# Wrap the result of af_model.animate() in IPython's HTML display object so
# the notebook renders it as the cell output.
HTML(af_model.animate())
----
+*In[ ]:*+
[source, ipython3]
----
# Last expression of the cell: the return value of get_seqs() is displayed.
af_model.get_seqs()
----
+*In[ ]:*+
[source, ipython3]
----
#@markdown #Redesign with ProteinMPNN
# Form parameters read by the ProteinMPNN sampling cell: num_seqs is divided
# by 8 to get the number of batches, mpnn_sampling_temp is the sampling
# temperature, and rm_aa is forwarded to prep_inputs. use_solubleMPNN
# re-assigns the flag also set in the hallucination form.
num_seqs = 8 #@param ["8", "16", "32", "64"] {type:"raw"}
mpnn_sampling_temp = 0.1 #@param ["0.0001", "0.1", "0.15", "0.2", "0.25", "0.3", "0.5", "1.0"] {type:"raw"}
rm_aa = "C" #@param {type:"string"}
use_solubleMPNN = False #@param {type:"boolean"}
#@markdown - `mpnn_sampling_temp` - control diversity of sampled sequences. (higher = more diverse).
#@markdown - `rm_aa='C'` - do not use [C]ysteines.
#@markdown - `use_solubleMPNN` - use weights trained only on soluble proteins. See [preprint](https://www.biorxiv.org/content/10.1101/2023.05.09.540044v2).
#@markdown
----
+*In[ ]:*+
[source, ipython3]
----
from colabdesign.shared.protein import alphabet_list as chain_list

# The weight set is chosen when the model is constructed, the same way
# add_mpnn_loss does it. It was previously passed to prep_inputs, where
# presumably it had no effect (verify against ColabDesign).
mpnn_model = mk_mpnn_model(weights="soluble" if use_solubleMPNN else "original")

# `homooligomer` was misspelled as `homooligmer`, so the flag presumably never
# reached prep_inputs for COPIES > 1 (verify against the ColabDesign signature).
mpnn_model.prep_inputs(pdb_filename=f"{af_model.protocol}.pdb",
                       chain=",".join(chain_list[:COPIES]),
                       homooligomer=COPIES > 1,
                       rm_aa=rm_aa)

# num_seqs sequences in total, drawn as batches of 8.
out = mpnn_model.sample(num=num_seqs // 8,
                        batch=8,
                        temperature=mpnn_sampling_temp)

# Print the score and the part of each sequence before the first "/".
for seq, score in zip(out["seq"], out["score"]):
    print(score, seq.split("/")[0])
----
#Run ESMfold
+*In[ ]:*+
[source, ipython3]
----
# For every sampled sequence: fetch a structure via esmfold_api, score it
# against the designed model with TMscore, and remember the lowest-RMSD hit.
print("# rmsd tmscore sequence")
best, best_rmsd = {}, None
for n, seq in enumerate(out["seq"]):
    # Only the part before the first "/" is submitted.
    x = seq.split("/")[0]
    esm_pdb = f"{af_model.protocol}.esmfold.{n}.pdb"
    with open(esm_pdb, "w") as handle:
        handle.write(esmfold_api(x))
    o = tmscore(f"{af_model.protocol}.pdb", esm_pdb)
    print(n, o["rms"], o["tms"], x)
    improved = best_rmsd is None or o["rms"] < best_rmsd
    if improved:
        best_rmsd = o["rms"]
        best = {**o, "seq": x}
----
+*In[ ]:*+
[source, ipython3]
----
# Last expression of the cell: display the scores and sequence of the
# lowest-RMSD candidate kept by the evaluation loop.
best
----