# This file is part of ModPipe, Copyright 1997-2020 Andrej Sali
#
# ModPipe is free software: you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ModPipe.  If not, see <http://www.gnu.org/licenses/>.

from __future__ import print_function
import subprocess, os, re
import modpipe.binaries

class CDHit:
    """Thin wrapper around the CD-HIT sequence clustering program.

    An instance is bound to one FASTA input file.  ``cluster`` runs CD-HIT
    on that file and ``parse_clusters`` reads the resulting ``.clstr`` file
    into a dictionary.

    Attributes set by this class:
      seqfile    -- path of the input sequence file given to the constructor.
      exe        -- path of the CD-HIT executable (from modpipe.binaries).
      clustrfile -- path of the cluster file; '' until ``cluster`` has run.
      clusters   -- result of the last ``parse_clusters`` call.
    """

    # A line opening a new cluster, e.g. ">Cluster 12".
    _CLUSTER_START = re.compile(r'^>Cluster')
    # A non-representative entry, e.g. "1\t118aa, >ID... at 98%".  The
    # identity may carry a decimal part ("98.50%").
    _MEMBER = re.compile(
        r'(\d+)\s+(\d+)aa,\s>(\w+)\.\.\.\sat\s(\d+(?:\.\d+)?%)')
    # The representative entry of a cluster, e.g. "0\t120aa, >ID... *".
    _REPRESENTATIVE = re.compile(r'(\d+)\s+(\d+)aa,\s>(\w+)\.\.\.\s\*')

    def __init__(self, seqfile):
        """Bind the object to `seqfile` and locate the CD-HIT executable.

        Raises RuntimeError (with a descriptive message) if `seqfile` does
        not exist or is empty, or if the CD-HIT executable cannot be found.
        """
        self.seqfile = seqfile
        if not os.path.isfile(seqfile):
            raise RuntimeError("File not found: %s" % seqfile)
        if os.path.getsize(seqfile) == 0:
            raise RuntimeError("File %s is empty!" % seqfile)

        self.exe = modpipe.binaries.get_cd_hit()
        if not os.path.isfile(self.exe):
            raise RuntimeError("Could not find CD-HIT executable: %s"
                               % self.exe)

        self.clustrfile = ''
        self.clusters = {}

    @staticmethod
    def _locate_clustrfile(outfile):
        """Return the path of the cluster file that belongs to `outfile`.

        The expected location is `outfile` with '.clstr' appended.  The
        name this class historically used (basename of `outfile` without
        its extension, plus '.clstr', relative to the current directory)
        is returned only when the expected file is absent and that legacy
        file exists.
        """
        expected = outfile + '.clstr'
        stem = os.path.splitext(os.path.basename(outfile))[0]
        legacy = stem + '.clstr'
        if not os.path.isfile(expected) and os.path.isfile(legacy):
            return legacy
        return expected

    def cluster(self, outfile, sequence_identity=95, throw_away_seq_length=30,
                length_difference=10):
        """Run CD-HIT on the input file and write the result to `outfile`.

        Arguments:
          outfile               -- output file name (CD-HIT ``-o``).
          sequence_identity     -- identity threshold (CD-HIT ``-c``); values
                                   greater than 1 are taken as percentages
                                   and divided by 100.
          throw_away_seq_length -- passed to CD-HIT as ``-l``.
          length_difference     -- passed to CD-HIT as ``-S``.

        Returns a tuple (stdout, stderr, clustrfile): the captured output
        streams of the program and the path of the cluster file, which is
        also stored in ``self.clustrfile``.

        Raises RuntimeError if CD-HIT exits with a non-zero status, or if
        the output file or the cluster file is missing or empty.
        """
        # Renormalize sequence identity to a fraction with two decimals
        if sequence_identity > 1:
            sequence_identity = float(sequence_identity) / 100
        identity_arg = "%4.2f" % sequence_identity

        command = [self.exe, "-i", self.seqfile,
                   "-o", outfile,
                   "-n", str(5), "-c", identity_arg,
                   "-M", str(1000), "-l", str(throw_away_seq_length),
                   "-S", str(length_difference)]

        # Both streams are piped so that stderr is really captured;
        # communicate() drains them together and cannot deadlock.
        proc = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()

        self.clustrfile = self._locate_clustrfile(outfile)

        # Failure is judged by exit status and by the presence of the result
        # files, not by stderr being non-empty: output on stderr alone is
        # not treated as an error.
        if (proc.returncode != 0 or not os.path.isfile(outfile)
                or not os.path.isfile(self.clustrfile)):
            if isinstance(stderr, bytes):
                err_text = stderr.decode('utf-8', 'replace')
            else:
                err_text = stderr
            raise RuntimeError("CD-HIT failed with the following errors:\n%s"
                               % err_text)

        if os.path.getsize(outfile) == 0:
            raise RuntimeError("CD-HIT produced an empty output file: %s"
                               % outfile)

        if os.path.getsize(self.clustrfile) == 0:
            raise RuntimeError("CD-HIT produced an empty cluster file: %s"
                               % self.clustrfile)

        return stdout, stderr, self.clustrfile

    @staticmethod
    def _store_cluster(clusters, header, representative, members):
        """Record one finished cluster in `clusters`, keyed by representative.

        Raises ValueError if no representative entry was seen for it.
        """
        if representative is None:
            raise ValueError("No representative sequence found for '%s'"
                             % header)
        clusters[representative] = members

    def parse_clusters(self):
        """Parse the entries in the cluster file.

        Reads ``self.clustrfile`` and returns a dictionary that maps the ID
        of each cluster's representative sequence to the list of all IDs in
        that cluster (the representative included), in file order.  The
        dictionary is also stored in ``self.clusters``.

        Lines that are neither a cluster header nor a recognised entry are
        ignored, as is anything before the first cluster header.

        Raises ValueError if a cluster has no representative entry (for
        example when the file is truncated after a ">Cluster" line).
        """
        clusters = {}
        header = None           # header line of the cluster being read
        members = []
        representative = None

        with open(self.clustrfile, 'r') as handle:
            for line in handle:
                if self._CLUSTER_START.match(line):
                    # A new header closes the previous cluster, if any.
                    if header is not None:
                        self._store_cluster(clusters, header,
                                            representative, members)
                    header = line.strip()
                    members = []
                    representative = None
                    continue

                if header is None:
                    continue

                rep_match = self._REPRESENTATIVE.match(line)
                if rep_match:
                    representative = rep_match.group(3)
                    members.append(representative)
                    continue

                member_match = self._MEMBER.match(line)
                if member_match:
                    members.append(member_match.group(3))

        # The last cluster is closed by end-of-file, not by a header.
        if header is not None:
            self._store_cluster(clusters, header, representative, members)

        self.clusters = clusters
        return self.clusters
