Branch: lyskov/rosetta-ci:commits 「revision: №1」
Test: colab.none.root.ColabDesign
SubTest: ColabDesign/af/examples/af_pseudo_diffusion_dgram
SubTest files: 「file-system-view」
Daemon: devel    
State: ColabDesign/af/examples/af_pseudo_diffusion_dgram

Input Notebook: ColabDesign/af/examples/af_pseudo_diffusion_dgram.ipynb Output Notebook: /root/working_dir/ColabDesign_af_examples_af_pseudo_diffusion_dgram.ipynb Executing: 0% 0/9 [00:00<?, ?cell/s]Executing notebook with kernel: python3 Executing: 11% 1/9 [00:00<00:07, 1.12cell/s] Executing: 33% 3/9 [00:01<00:03, 1.64cell/s] Executing: 44% 4/9 [00:02<00:02, 2.19cell/s] Executing: 56% 5/9 [00:02<00:01, 2.42cell/s] Executing: 56% 5/9 [00:03<00:02, 1.65cell/s] Traceback (most recent call last): File "/usr/local/bin/papermill", line 8, in <module> sys.exit(papermill()) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1157, in __call__ return self.main(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1078, in main rv = self.invoke(ctx) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 1434, in invoke return ctx.invoke(self.callback, **ctx.params) File "/usr/local/lib/python3.10/dist-packages/click/core.py", line 783, in invoke return __callback(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/click/decorators.py", line 33, in new_func return f(get_current_context(), *args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/papermill/cli.py", line 235, in papermill execute_notebook( File "/usr/local/lib/python3.10/dist-packages/papermill/execute.py", line 131, in execute_notebook raise_for_execution_errors(nb, output_path) File "/usr/local/lib/python3.10/dist-packages/papermill/execute.py", line 251, in raise_for_execution_errors raise error papermill.exceptions.PapermillExecutionError: --------------------------------------------------------------------------- Exception encountered at "In [3]": AssertionErrorTraceback (most recent call last) <ipython-input-3-175fd96880d9> in <cell line: 38>() 70 71 # denoise ---> 72 aux = af_model.predict(return_aux=True, verbose=False, 73 sample_models=sample_models, 74 dropout=use_dropout, num_recycles=num_recycles) ~/working_dir/colabdesign/af/design.py in predict(self, seq, bias, num_models, num_recycles, models, sample_models, dropout, hard, soft, temp, return_aux, verbose, seed, **kwargs) 299 300 # run --> 301 self.run(num_recycles=num_recycles, num_models=num_models, 302 sample_models=sample_models, models=models, backprop=False, **kwargs) 303 if verbose: self._print_log("predict") ~/working_dir/colabdesign/af/design.py in run(self, num_recycles, num_models, sample_models, models, backprop, callback, model_nums, return_aux) 88 if model_nums is None: 89 model_nums = self._get_model_nums(num_models, sample_models, models) ---> 90 assert len(model_nums) > 0, "ERROR: no model params defined" 91 92 # loop through model params AssertionError: ERROR: no model params defined [NbConvertApp] Converting notebook /root/working_dir/ColabDesign_af_examples_af_pseudo_diffusion_dgram.ipynb to html [NbConvertApp] Writing 328046 bytes to /root/working_dir/ColabDesign_af_examples_af_pseudo_diffusion_dgram.html [NbConvertApp] Converting notebook /root/working_dir/ColabDesign_af_examples_af_pseudo_diffusion_dgram.ipynb to asciidoc /usr/local/lib/python3.10/dist-packages/nbconvert/utils/pandoc.py:51: RuntimeWarning: You are using an unsupported version of pandoc (2.9.2.1). Your version must be at least (2.14.2) but less than (4.0.0). Refer to https://pandoc.org/installing.html. Continuing with doubts... check_pandoc_version() [NbConvertApp] Writing 11735 bytes to /root/working_dir/ColabDesign_af_examples_af_pseudo_diffusion_dgram.asciidoc ---------------------------------------------------------------- An Exception was encountered at `In [3]'. #AF_pseudo_diffusion + proteinMPNN Hacking AlphaFold to be a diffusion model (for backbone generation) via distogram. At each step add logits from proteinMPNN. *WARNING*: This notebook is experimental, designed as a control. Not intended for practical use at this stage. +*In[1]:*+ [source, ipython3] ---- #@title setup import os if not os.path.isdir("params"): # get code os.system("pip -q install git+https://github.com/sokrypton/ColabDesign.git@v1.1.1") # for debugging os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign") # download params os.system("mkdir params") os.system("apt-get install aria2 -qq") os.system("aria2c -q -x 16 https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar") os.system("tar -xf alphafold_params_2022-12-06.tar -C params") import warnings warnings.simplefilter(action='ignore', category=FutureWarning) import os, re from colabdesign import mk_afdesign_model, clear_mem from colabdesign.mpnn import mk_mpnn_model from colabdesign.af.alphafold.common import residue_constants from colabdesign.shared.protein import _np_get_cb from IPython.display import HTML from google.colab import files import numpy as np import jax.numpy as jnp import jax from scipy.special import softmax import tqdm.notebook TQDM_BAR_FORMAT = '{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]' def get_pdb(pdb_code=""): if pdb_code is None or pdb_code == "": upload_dict = files.upload() pdb_string = upload_dict[list(upload_dict.keys())[0]] with open("tmp.pdb","wb") as out: out.write(pdb_string) return "tmp.pdb" elif os.path.isfile(pdb_code): return pdb_code elif len(pdb_code) == 4: os.system(f"wget -qnc https://files.rcsb.org/view/{pdb_code}.pdb") return f"{pdb_code}.pdb" else: os.system(f"wget -qnc https://alphafold.ebi.ac.uk/files/AF-{pdb_code}-F1-model_v3.pdb") return f"AF-{pdb_code}-F1-model_v3.pdb" def sample_gumbel(shape, eps=1e-20): """Sample from Gumbel(0, 1)""" U = np.random.uniform(size=shape) return -np.log(-np.log(U + eps) + eps) ---- +*In[2]:*+ [source, ipython3] ---- #@title initialize the model length = 100 #@param {type:"integer"} #symmetry = "C" #@param ["C"] #copies = 1 #@param {type:"integer"} #@markdown Provide a starting point (optional) starting_seq = "" #@param {type:"string"} starting_seq = re.sub("[^A-Z]", "", starting_seq.upper()) #@markdown - if `starting_seq` provided the `length` option will be overwritten by length of starting sequence. #@markdown Experimental options use_multimer = False mode = "dgram_retrain" #@param ["dgram","dgram_retrain"] ##@markdown - `xyz` - use structure output as template input #@markdown - `dgram` - use distogram output as template input #@markdown - `dgram_retrain` - replace distogram head from AlphaFold with one retrained to map output bins to template bins. if len(starting_seq) > 0: length = len(starting_seq) clear_mem() af_model = mk_afdesign_model(protocol="hallucination", use_templates=True, debug=True, use_multimer=use_multimer) af_model.prep_inputs(length=length) mpnn_model = mk_mpnn_model() print("lengths",af_model._lengths) if "dgram" in mode: if "retrain" in mode and not use_multimer: # update distogram head to return all 39 bins af_model._cfg.model.heads.distogram.first_break = 3.25 af_model._cfg.model.heads.distogram.last_break = 50.75 af_model._cfg.model.heads.distogram.num_bins = 39 af_model._model = af_model._get_model(af_model._cfg) from colabdesign.af.weights import __file__ as af_path template_dgram_head = np.load(os.path.join(os.path.dirname(af_path),'template_dgram_head.npy')) for k in range(len(af_model._model_params)): params = {"weights":jnp.array(template_dgram_head[k]),"bias":jnp.zeros(39)} af_model._model_params[k]["alphafold/alphafold_iteration/distogram_head/half_logits"] = params else: dgram_map = np.eye(39)[np.repeat(np.append(0,np.arange(15)),4)] dgram_map[-1,:] = 0 def get_dgram(positions, num_bins=39, min_bin=3.25, max_bin=50.75): atom_idx = residue_constants.atom_order atoms = {k:positions[...,atom_idx[k],:] for k in ["N","CA","C"]} cb = _np_get_cb(**atoms, use_jax=False) dist2 = np.square(cb[None,:] - cb[:,None]).sum(-1,keepdims=True) lower_breaks = np.linspace(min_bin, max_bin, num_bins) lower_breaks = np.square(lower_breaks) upper_breaks = np.concatenate([lower_breaks[1:],np.array([1e8], dtype=jnp.float32)], axis=-1) return ((dist2 > lower_breaks) * (dist2 < upper_breaks)).astype(float) ---- +*Out[2]:*+ ---- WARNING: 'model_1_ptm' not found WARNING: 'model_2_ptm' not found lengths [100] ---- [#papermill-error-cell]#Execution using papermill encountered an exception here and stopped:# +*In[3]:*+ [source, ipython3] ---- #@title run protocol #@markdown Optimization options iterations = 100 #@param ["50", "100"] {type:"raw"} use_dgram_noise = True #@param {type:"boolean"} use_seq_noise = True #@param {type:"boolean"} use_dropout = True #@param {type:"boolean"} seqsep_mask = 6 #@param {type:"integer"} #@markdown AlphaFold options sample_models = True #@param {type:"boolean"} num_recycles = 0 #@param ["0", "1", "2", "3"] {type:"raw"} #@markdown proteinMPNN options (set to `none` to disable) mpnn_mode = "conditional" #@param ["none","sample", "conditional", "unconditional"] L = sum(af_model._lengths) af_model.restart(mode="gumbel") af_model._inputs["rm_template_seq"] = True # gather info about inputs if "offset" in af_model._inputs: offset = af_model._inputs else: idx = af_model._inputs["residue_index"] offset = idx[:,None] - idx[None,:] # initialize sequence if len(starting_seq) > 1: af_model.set_seq(seq=starting_seq) af_model._inputs["bias"] = np.zeros((L,20)) # initialize coordinates/dgram af_model._inputs["batch"] = {"aatype":np.zeros(L).astype(int), "all_atom_mask":np.zeros((L,37)), "all_atom_positions":np.zeros((L,37,3)), "dgram":np.zeros((L,L,39))} for k in range(iterations): # disable stochastic part for the last 10 steps if k > (iterations - 10): use_dropout = False sample_models = False use_seq_noise = False seqsep_mask = 0.0 # noise if k > 0: dgram_xyz = get_dgram(xyz) dgram_prob = softmax(dgram_logits,-1) if use_seq_noise: af_model._inputs["bias"] = 0.1 * sample_gumbel((L,20)) if mode == "xyz": dgram = dgram_xyz if mode == "dgram": dgram = dgram_prob @ dgram_map dgram[...,14:] = dgram_xyz[...,14:] * dgram_prob[...,-1:] if mode == "dgram_retrain": dgram = dgram_prob if use_dgram_noise: noise = sample_gumbel(dgram.shape) * (1 - k/iterations) dgram = softmax(np.log(dgram + 1e-8) + noise, -1) # add mask to avoid local contacts being fixed (otherwise there is a bias toward helix) mask = np.abs(offset) > seqsep_mask af_model._inputs["batch"]["dgram"] = dgram * mask[:,:,None] # denoise aux = af_model.predict(return_aux=True, verbose=False, sample_models=sample_models, dropout=use_dropout, num_recycles=num_recycles) plddt = aux["plddt"] seq = aux["seq"]["hard"][0].argmax(-1) xyz = aux["atom_positions"].copy() dgram_logits = aux["debug"]["outputs"]["distogram"]["logits"] # update inputs af_model._inputs["batch"]["aatype"] = seq af_model._inputs["batch"]["all_atom_mask"][:,:4] = np.sqrt(plddt)[:,None] af_model._inputs["batch"]["all_atom_positions"] = xyz # add logits from proteinmpnn at each stage if mpnn_mode != "none": mpnn_model.get_af_inputs(af_model) if mpnn_mode == "sample": mpnn_out = mpnn_model.sample(temp = 1-k/iterations) mpnn_logits = mpnn_out["logits"][0,:,:20] aux["log"]["mpnn"] = mpnn_out["score"][0] else: opt = {} if mpnn_mode == "conditional" else {"ar_mask":np.zeros((L,L))} mpnn_out = mpnn_model.score(**opt) mpnn_logits = mpnn_out["logits"][:,:20] aux["log"]["mpnn"] = mpnn_out["score"] beta = np.square(k/iterations) * plddt[:,None] af_model._params["seq"] = (1-beta) * af_model._params["seq"] + beta * mpnn_logits # save results af_model._save_results(aux) af_model._k += 1 ---- +*Out[3]:*+ ---- AssertionErrorTraceback (most recent call last) <ipython-input-3-175fd96880d9> in <cell line: 38>() 70 71 # denoise ---> 72 aux = af_model.predict(return_aux=True, verbose=False, 73 sample_models=sample_models, 74 dropout=use_dropout, num_recycles=num_recycles) ~/working_dir/colabdesign/af/design.py in predict(self, seq, bias, num_models, num_recycles, models, sample_models, dropout, hard, soft, temp, return_aux, verbose, seed, **kwargs) 299 300 # run --> 301 self.run(num_recycles=num_recycles, num_models=num_models, 302 sample_models=sample_models, models=models, backprop=False, **kwargs) 303 if verbose: self._print_log("predict") ~/working_dir/colabdesign/af/design.py in run(self, num_recycles, num_models, sample_models, models, backprop, callback, model_nums, return_aux) 88 if model_nums is None: 89 model_nums = self._get_model_nums(num_models, sample_models, models) ---> 90 assert len(model_nums) > 0, "ERROR: no model params defined" 91 92 # loop through model params AssertionError: ERROR: no model params defined ---- +*In[ ]:*+ [source, ipython3] ---- af_model.save_pdb("0.pdb") af_model.plot_pdb() af_model.get_seqs() ---- +*In[ ]:*+ [source, ipython3] ---- HTML(af_model.animate(dpi=100)) ---- +*In[ ]:*+ [source, ipython3] ---- #@title sample new sequences using proteinMPNN and rescore with alphafold (w/o template) #@markdown #### Design Options num_seqs = 32 #@param ["32", "64", "128", "256", "512", "1024"] {type:"raw"} sampling_temp = 0.1 import pandas as pd from google.colab import data_table data_table.enable_dataframe_formatter() # zero out template inputs out = mpnn_model.sample(num=num_seqs//32, batch=32, temperature=sampling_temp) af_terms = ["plddt","ptm","pae"] for k in af_terms: out[k] = [] os.system("mkdir -p output/all_pdb") af_model._inputs["batch"]["dgram"] = np.zeros((L,L,39)) with tqdm.notebook.tqdm(total=out["S"].shape[0], bar_format=TQDM_BAR_FORMAT) as pbar: with open("design.fasta","w") as fasta: for n in range(num_seqs): seq = out["seq"][n] af_model.predict(seq=seq, num_recycles=1, num_models=1, verbose=False) for t in af_terms: out[t].append(af_model.aux["log"][t]) out["pae"][-1] = out["pae"][-1] * 31 af_model._save_results(save_best=True, verbose=False) af_model.save_current_pdb(f"output/all_pdb/n{n}.pdb") af_model._k += 1 line = f'>mpnn:{out["score"][n]:.3f}_plddt:{out["plddt"][n]:.3f}_ptm:{out["ptm"][n]:.3f}_pae:{out["pae"][n]:.3f}\n{out["seq"][n]}' fasta.write(line+"\n") pbar.update(1) labels = ["score"] + af_terms + ["seq"] data = [[out[k][n] for k in labels] for n in range(num_seqs)] labels[0] = "mpnn" df = pd.DataFrame(data, columns=labels) df.to_csv('output/mpnn_results.csv') data_table.DataTable(df.round(3).sort_values("pae")) ---- +*In[ ]:*+ [source, ipython3] ---- af_model.save_pdb("1.pdb") af_model.plot_pdb() af_model.get_seqs() ----