SubTest ColabDesign.ColabDesign/af/examples/af_relax

Branch: lyskov/rosetta-ci:main 「revision: №1」
Test: colab.none.root.ColabDesign
SubTest: ColabDesign/af/examples/af_relax_design
SubTest files: 「file-system-view」
Daemon: devel
State: ColabDesign/af/examples/af_relax_design

Input Notebook: ColabDesign/af/examples/af_relax_design.ipynb Output Notebook: /root/working_dir/ColabDesign_af_examples_af_relax_design.ipynb Executing: 0% 0/13 [00:00<?, ?cell/s]Executing notebook with kernel: python3 Executing: 8% 1/13 [00:00<00:10, 1.12cell/s] Executing: 23% 3/13 [00:09<00:32, 3.27s/cell] Executing: 23% 3/13 [00:09<00:32, 3.21s/cell] Traceback (most recent call last): File "/usr/local/bin/papermill", line 8, in <module> sys.exit(papermill()) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1157, in __call__ return self.main(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1078, in main rv = self.invoke(ctx) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1434, in invoke return ctx.invoke(self.callback, **ctx.params) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 783, in invoke return __callback(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/click/decorators.py", line 33, in new_func return f(get_current_context(), *args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/papermill/cli.py", line 235, in papermill execute_notebook( File "/usr/local/lib/python3.10/dist-packages/papermill/execute.py", line 131, in execute_notebook raise_for_execution_errors(nb, output_path) File "/usr/local/lib/python3.10/dist-packages/papermill/execute.py", line 251, in raise_for_execution_errors raise error papermill.exceptions.PapermillExecutionError: --------------------------------------------------------------------------- Exception encountered at "In [1]": ModuleNotFoundErrorTraceback (most recent call last) <ipython-input-1-677ba16fe0a9> in <cell line: 46>() 44 import asyncio 45 import nest_asyncio ---> 46 from pyppeteer import launch 47 import base64 48 ModuleNotFoundError: No module named 'pyppeteer' [NbConvertApp] Converting notebook /root/working_dir/ColabDesign_af_examples_af_relax_design.ipynb to html [NbConvertApp] Writing 319482 bytes to /root/working_dir/ColabDesign_af_examples_af_relax_design.html [NbConvertApp] Converting notebook /root/working_dir/ColabDesign_af_examples_af_relax_design.ipynb to asciidoc /usr/local/lib/python3.10/dist-packages/nbconvert/utils/pandoc.py:51: RuntimeWarning: You are using an unsupported version of pandoc (2.9.2.1). Your version must be at least (2.14.2) but less than (4.0.0). Refer to https://pandoc.org/installing.html. Continuing with doubts... check_pandoc_version() [NbConvertApp] Writing 9878 bytes to /root/working_dir/ColabDesign_af_examples_af_relax_design.asciidoc ---------------------------------------------------------------- An Exception was encountered at `In [1]'. #af_relax_design (WIP) *Efficient and scalable de novo protein design using a relaxed sequence space* Christopher Josef Frank, Ali Khoshouei, Yosta de Stigter, Dominik Schiewitz, Shihao Feng, Sergey Ovchinnikov, Hendrik Dietz doi: https://doi.org/10.1101/2023.02.24.529906 *WARNING* This notebook is in development, we are still working on adding all the options from the manuscript above. [#papermill-error-cell]#Execution using papermill encountered an exception here and stopped:# +*In[1]:*+ [source, ipython3] ---- #@title setup import os if not os.path.isdir("params"): # get code os.system("pip -q install pyppeteer nest_asyncio") os.system("pip -q install git+https://github.com/sokrypton/ColabDesign.git") # for debugging os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign") # download params os.system("mkdir params") os.system("apt-get install aria2 -qq") os.system("aria2c -q -x 16 https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar") os.system("tar -xf alphafold_params_2022-12-06.tar -C params") import warnings warnings.simplefilter(action='ignore', category=FutureWarning) import os from colabdesign import mk_afdesign_model, clear_mem from colabdesign.mpnn import mk_mpnn_model from IPython.display import HTML from google.colab import files import numpy as np import requests, time if not os.path.isfile("TMscore"): os.system("wget -qnc https://zhanggroup.org/TM-score/TMscore.cpp") os.system("g++ -static -O3 -ffast-math -lm -o TMscore TMscore.cpp") def tmscore(x,y): # pass to TMscore output = os.popen(f'./TMscore {x} {y}') # parse outputs parse_float = lambda x: float(x.split("=")[1].split()[0]) o = {} for line in output: line = line.rstrip() if line.startswith("RMSD"): o["rms"] = parse_float(line) if line.startswith("TM-score"): o["tms"] = parse_float(line) if line.startswith("GDT-TS-score"): o["gdt"] = parse_float(line) return o import asyncio import nest_asyncio from pyppeteer import launch import base64 # Apply nest_asyncio to enable nested event loops nest_asyncio.apply() async def fetch_blob_content(page, blob_url): blob_to_base64 = """ async (blobUrl) => { const blob = await fetch(blobUrl).then(r => r.blob()); return new Promise((resolve) => { const reader = new FileReader(); reader.onloadend = () => resolve(reader.result); reader.readAsDataURL(blob); }); } """ base64_data = await page.evaluate(blob_to_base64, blob_url) _, encoded = base64_data.split(',', 1) return base64.b64decode(encoded) async def extract_pdb_file_download_link_and_content(url): browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox']) page = await browser.newPage() await page.goto(url, {'waitUntil': 'networkidle0'}) elements = await page.querySelectorAll('a.btn.bg-purple') for element in elements: href = await page.evaluate('(element) => element.getAttribute("href")', element) if 'blob:https://esmatlas.com/' in href: content = await fetch_blob_content(page, href) await browser.close() return href, content await browser.close() return "No PDB file link found.", None def esmfold_api(sequence): url = f'https://esmatlas.com/resources/fold/result?fasta_header=%3Eunnamed&sequence={sequence}' result = asyncio.get_event_loop().run_until_complete(extract_pdb_file_download_link_and_content(url)) if result[1]: pdb_str = result[1].decode('utf-8') return pdb_str else: return "Failed to retrieve PDB content." import jax import jax.numpy as jnp from colabdesign.af.alphafold.common import residue_constants ---- +*Out[1]:*+ ---- ModuleNotFoundErrorTraceback (most recent call last) <ipython-input-1-677ba16fe0a9> in <cell line: 46>() 44 import asyncio 45 import nest_asyncio ---> 46 from pyppeteer import launch 47 import base64 48 ModuleNotFoundError: No module named 'pyppeteer' ---- +*In[ ]:*+ [source, ipython3] ---- #@title # hallucination #@markdown For a given length, generate/hallucinate a protein sequence that AlphaFold thinks folds into a well structured protein (high plddt, low pae, many contacts). LENGTH = 100 #@param {type:"integer"} COPIES = 1 #@param ["1", "2", "3", "4", "5", "6", "7", "8"] {type:"raw"} MODE = "manuscript" #@param ["original", "manuscript"] use_rg_loss = True #@param {type:"boolean"} #@markdown ProteinMPNN Settings use_mpnn_loss = False #@param {type:"boolean"} use_solubleMPNN = False #@param {type:"boolean"} #@markdown def add_rg_loss(self, weight=0.1): '''add radius of gyration loss''' def loss_fn(inputs, outputs): xyz = outputs["structure_module"] ca = xyz["final_atom_positions"][:,residue_constants.atom_order["CA"]] if self.protocol == "binder": ca = ca[-self._binder_len:] if MODE == "manuscript": ca = ca[::5] rg = jnp.sqrt(jnp.square(ca - ca.mean(0)).sum(-1).mean() + 1e-8) if MODE == "original": rg_th = 2.38 * ca.shape[0] ** 0.365 rg = jax.nn.elu(rg - rg_th) return {"rg":rg} self._callbacks["model"]["loss"].append(loss_fn) self.opt["weights"]["rg"] = weight def add_mpnn_loss(self, mpnn=0.1, mpnn_seq=0.0): ''' add mpnn loss mpnn = maximize confidence of proteinmpnn mpnn_seq = push designed sequence to match proteinmpnn logits ''' self._mpnn = mk_mpnn_model(weights = "soluble" if use_solubleMPNN else "original") def loss_fn(inputs, outputs, aux, key): # get structure atom_idx = tuple(residue_constants.atom_order[k] for k in ["N","CA","C","O"]) I = {"S": inputs["aatype"], "residue_idx": inputs["residue_index"], "chain_idx": inputs["asym_id"], "X": outputs["structure_module"]["final_atom_positions"][:,atom_idx], "mask": outputs["structure_module"]["final_atom_mask"][:,1], "lengths": self._lengths, "key": key} if "offset" in inputs: I["offset"] = inputs["offset"] # set autoregressive mask L = sum(self._lengths) if self.protocol == "binder": I["ar_mask"] = 1 - np.eye(L) I["ar_mask"][-self._len:,-self._len:] = 0 else: I["ar_mask"] = np.zeros((L,L)) # get logits logits = self._mpnn._score(**I)["logits"][:,:20] if self.protocol == "binder": logits = logits[-self._len:] else: logits = logits[:self._len] aux["mpnn_logits"] = logits # compute loss log_q = jax.nn.log_softmax(logits) p = inputs["seq"]["hard"] q = jax.nn.softmax(logits) losses = {} losses["mpnn"] = -log_q.max(-1).mean() losses["mpnn_seq"] = -(p * jax.lax.stop_gradient(log_q)).sum(-1).mean() return losses self._callbacks["model"]["loss"].append(loss_fn) self.opt["weights"]["mpnn"] = mpnn self.opt["weights"]["mpnn_seq"] = mpnn_seq clear_mem() af_model = mk_afdesign_model(protocol="hallucination") af_model.prep_inputs(length=LENGTH, copies=COPIES) # add extra losses if use_rg_loss: add_rg_loss(af_model) if use_mpnn_loss: add_mpnn_loss(af_model) print("length",af_model._lengths) print("weights",af_model.opt["weights"]) ---- +*In[ ]:*+ [source, ipython3] ---- af_model.restart() if MODE == "original": # pre-design with gumbel initialization and softmax activation af_model.set_weights(plddt=0.0, pae=0.0) af_model.set_seq(mode=["gumbel"]) af_model.design_soft(50) af_model.set_seq(af_model.aux["seq"]["pseudo"]) if MODE == "manuscript": af_model.set_seq(mode=["gumbel","soft"]) af_model.set_weights(plddt=1.0, pae=1.0) af_model.design_logits(40) af_model.design_logits(10, save_best=True) ---- +*In[ ]:*+ [source, ipython3] ---- af_model.save_pdb(f"{af_model.protocol}.pdb") af_model.plot_pdb() ---- +*In[ ]:*+ [source, ipython3] ---- HTML(af_model.animate()) ---- +*In[ ]:*+ [source, ipython3] ---- af_model.get_seqs() ---- +*In[ ]:*+ [source, ipython3] ---- #@markdown #Redesign with ProteinMPNN num_seqs = 8 #@param ["8", "16", "32", "64"] {type:"raw"} mpnn_sampling_temp = 0.1 #@param ["0.0001", "0.1", "0.15", "0.2", "0.25", "0.3", "0.5", "1.0"] {type:"raw"} rm_aa = "C" #@param {type:"string"} use_solubleMPNN = False #@param {type:"boolean"} #@markdown - `mpnn_sampling_temp` - control diversity of sampled sequences. (higher = more diverse). #@markdown - `rm_aa='C'` - do not use [C]ysteines. #@markdown - `use_solubleMPNN` - use weights trained only on soluble proteins. See [preprint](https://www.biorxiv.org/content/10.1101/2023.05.09.540044v2). #@markdown ---- +*In[ ]:*+ [source, ipython3] ---- from colabdesign.shared.protein import alphabet_list as chain_list mpnn_model = mk_mpnn_model() mpnn_model.prep_inputs(pdb_filename=f"{af_model.protocol}.pdb", chain=",".join(chain_list[:COPIES]), homooligmer=COPIES>1, rm_aa=rm_aa, weights = "soluble" if use_solubleMPNN else"original") out = mpnn_model.sample(num=num_seqs//8, batch=8, temperature=mpnn_sampling_temp) for seq,score in zip(out["seq"],out["score"]): print(score,seq.split("/")[0]) ---- #Run ESMfold +*In[ ]:*+ [source, ipython3] ---- print("# rmsd tmscore sequence") best = {} best_rmsd = None for n,seq in enumerate(out["seq"]): x = seq.split("/")[0] with open(f"{af_model.protocol}.esmfold.{n}.pdb","w") as handle: pdb_str = esmfold_api(x) handle.write(pdb_str) o = tmscore(f"{af_model.protocol}.pdb", f"{af_model.protocol}.esmfold.{n}.pdb") print(n,o["rms"],o["tms"],x) if best_rmsd is None or o["rms"] < best_rmsd: best_rmsd = o["rms"] best = {**o,"seq":x} ---- +*In[ ]:*+ [source, ipython3] ---- best ----

ColabDesign/af/examples/af_relax_design sub-test description