#!/usr/bin/python
# This file is part of ModPipe, Copyright 1997-2020 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function, division
from modeller import *
from optparse import OptionParser
import modpipe.version
import modpipe.pdbutils
import shutil
import sys, os, re

# Overwrite the program name so that the `%prog` placeholders in the usage
# text below (optparse derives %prog from sys.argv[0]) show the command name
# 'modpipe make_chains' rather than this script's file name.
sys.argv[0] = 'modpipe make_chains'

def _build_option_parser():
    """Create the command line parser for `modpipe make_chains`.

    Returns an OptionParser that understands -p/--pdb_repository,
    -o/--output_file, -f/--file_format and -s/--structure_type.
    """
    parser = OptionParser(version=modpipe.version.message())

    parser.set_usage("""
 This script takes a file with a list of PDB filenames (of the form
 pdb1abc.ent) and splits each file into individual chains that match
 some specified criteria.  It can optionally write all the sequences
 (from all the files) into a single file.

 Usage: %prog [options] pdblist

 pdblist is a file containing names of PDB files to process.

 Run `%prog -h` for help information
 """)

    # The default repository is looked up once so that it can be both
    # shown in the help text and used as the option default.
    default_repo = modpipe.pdbutils.get_pdb_repository(include_local=True)
    parser.add_option("-p", "--pdb_repository",
                 dest="pdbrep",
                 help="""Directory containing PDB files. The default
                      value is """ + str(default_repo),
                 default=default_repo, metavar="DIR")
    parser.add_option("-o", "--output_file",
                 dest="outfile",
                 help="""Append sequences to FILE. The default option
                      (when option is not specified) is to create individual
                      files for each chain. If set to -, the sequences will
                      be written to standard output.""",
                 default=None,
                 metavar="FILE")
    parser.add_option("-f", "--file_format",
                 dest="fileformat",
                 help="""Format of output sequence files. PIR | FASTA
                 (default: 'PIR')""",
                 default='PIR',
                 metavar="FORMAT")
    parser.add_option("-s", "--structure_type",
                 dest="structype",
                 help="""Add Structure Type (e.g. structureE)""",
                 default='')
    return parser


def _parse_pdb_entry(entry, pdblist, lineno):
    """Convert one line of the PDB list into a (file name, chain ID) pair.

    Accepted forms are a bare four-character code (`1abc`), a code
    followed by a one-character chain ID (`1abcA`), or any name that
    starts with `pdb` and is at least seven characters long
    (e.g. `pdb1abc.ent`).

    entry   -- the line, already stripped of its trailing newline
    pdblist -- name of the list file (used only in the error message)
    lineno  -- 1-based line number (used only in the error message)

    The chain ID is '' when the entry does not name a chain.
    Raises modpipe.FileFormatError for anything else.
    """
    chain = ''
    if len(entry) == 4:
        entry = "pdb" + entry
    elif len(entry) == 5:
        chain = entry[4:5]
        entry = "pdb" + entry[0:4]
    if len(entry) < 7 or not entry.startswith('pdb'):
        raise modpipe.FileFormatError(
               "PDB entries should be of the form pdb1abc.ent; "
               "encountered %s in %s, line %d" % (entry, pdblist, lineno))
    return entry, chain


def _extract_chains(env, pdb, chain, structype, fileformat):
    """Write each chain of one PDB entry that passes the filter.

    env        -- MODELLER environment used to read the structure
    pdb        -- file name of the form pdbXXXX...
    chain      -- one-character chain ID, or '' to read the whole entry
    structype  -- space-separated structure types accepted by the filter
    fileformat -- format passed to the chain writer ('PIR' or 'FASTA')

    Every accepted chain is written to '<code>.chn' in the current
    directory. Returns the list of file names written. Any exception
    raised while reading, filtering or writing is reported on standard
    output and ends processing of this entry only (best effort); files
    written before the error are still returned.
    """
    # Characters 3..6 are the four-letter PDB code (after the 'pdb' prefix)
    pdb4 = pdb[3:7]
    filelist = []
    try:
        if len(chain) == 1:
            segment = ('FIRST:' + chain, 'LAST:' + chain)
        else:
            segment = ('FIRST:@', 'LAST:')
        mdl = Model(env, file=pdb, model_segment=segment)

        for chn in mdl.chains:
            (atom_file, code) = chn.atom_file_and_code(pdb4)

            # At most 10% non-standard residues are allowed in a chain
            allowed_nonstdres = len(chn.residues) // 10

            if chn.filter(minimal_chain_length=30,
                          minimal_resolution=99.0, minimal_stdres=30,
                          chop_nonstd_termini=True,
                          max_nonstdres=allowed_nonstdres,
                          structure_types=structype):
                chn.write(code + '.chn', atom_file, code,
                          'C; Produced by MODELLER', format=fileformat,
                          chop_nonstd_termini=True)
                filelist.append(code + '.chn')

                print("Writing chain %s of %4s (%s; %daa) %d out of %d" % \
                          (chn.name, pdb4, code, len(chn.residues),
                           len(filelist), len(mdl.chains)))
            else:
                print("PDB structure failed chain filter: %s" % (code))

    except Exception as e:
        print("Modeller found an error in PDB %s (%s)" % (pdb, str(e)))
    return filelist


def main():
    """Split every PDB entry named in a list file into per-chain files.

    Reads the command line (see `modpipe make_chains -h`), then for each
    line of the list file writes the chains that pass the filter as
    '<code>.chn'. With -o/--output_file the per-chain files are appended
    to that file (or standard output for '-') and then removed.

    Raises modpipe.FileFormatError if a line of the list file is not a
    recognisable PDB entry.
    """
    parser = _build_option_parser()
    opts, args = parser.parse_args()

    # Only 'structureE' is honoured; any other -s value leaves the default
    # set of structure types unchanged.
    structype = "structureX structureN"
    if opts.structype == "structureE":
        structype += " structureE"

    if len(args) != 1:
        parser.error("You must specify a file containing PDB file names")
    pdblist = args[0]

    fhout = None
    if opts.outfile == '-':
        fhout = sys.stdout
    elif opts.outfile:
        fhout = open(opts.outfile, "a")

    try:
        # -- Open file containing list of PDB files
        with open(pdblist, "r") as f:
            # -- Initialize some modeller stuff
            env = Environ()
            env.io.atom_files_directory = opts.pdbrep

            for n, line in enumerate(f):
                # The chain ID is derived afresh for every line, so a
                # chain named on one line cannot leak into later entries.
                pdb, chain = _parse_pdb_entry(line.rstrip("\n"),
                                              pdblist, n + 1)

                filelist = _extract_chains(env, pdb, chain, structype,
                                           opts.fileformat)

                # Cat individual chains into a single file, if needed
                if fhout is not None:
                    for fname in filelist:
                        with open(fname) as fhin:
                            shutil.copyfileobj(fhin, fhout)
                        os.unlink(fname)
    finally:
        # Close the output even when a bad list entry aborts the run;
        # never close the process-wide standard output.
        if fhout is not None and fhout is not sys.stdout:
            fhout.close()

# Run the command line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
