/*  File: dnasubs.c
 *  Author: Jean Thierry-Mieg (mieg@mrc-lmba.cam.ac.uk)
 *  Copyright (C) J Thierry-Mieg and R Durbin, 1991
 *-------------------------------------------------------------------
 * This file is part of the ACEDB genome database package, written by
 * 	Richard Durbin (MRC LMB, UK) rd@mrc-lmba.cam.ac.uk, and
 *	Jean Thierry-Mieg (CRBM du CNRS, France) mieg@frmop11.bitnet
 *
 * Description:
 **  Packs and unpacks DNA arrays.
 * Exported functions:
 * HISTORY:
 * Last edited: Mar 23 13:06 1992 (rd)
 * * Oct 23 20:16 1991 (mieg): Change + to n in decodeChar
 * Created: Wed Oct 23 18:10:21 1991 (mieg)
 *-------------------------------------------------------------------
 */

#include "acedb.h"
#include "a.h"
#include "bs.h"
#include "dna.h"
#include "lex.h"
#include "classes.wrm"
#include "tags.wrm"

static Array dnaUnpackArray(Array pack) ;

/* Output mappings, used to print bases: the index is the internal base
 * code and the value is the letter written for it.  Each table holds
 * exactly 16 entries, so only indices 0..15 are valid.  The rna table
 * differs from the dna table only at index 2 ('u' instead of 't').
 */
char dnaDecodeChar[] =
 { '-', 'a','t','w','g','r','k','d','c','m','y','h','s','v','b','n' } ;
char rnaDecodeChar[] =
 { '-', 'a','u','w','g','r','k','d','c','m','y','h','s','v','b','n' } ;
/* Input mapping, used to parse bases: the index is the character read,
 * taken as an unsigned byte, and the value is the internal base code,
 * 0 meaning "not a base letter".  Upper and lower case letters map to
 * the same code and '-' maps to N_.
 *
 * The table is declared with 256 entries because every lookup in this
 * file indexes it with "& 0xFF".  Only the first 128 entries are listed;
 * the entries for bytes 128..255 are implicitly zero, so such bytes are
 * rejected like any other non-base character instead of reading past
 * the end of the table.
 */
static char dnaEncodeChar[256] =
{  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  N_,   0,   0,
   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

   0,  A_,  B_,  C_,  D_,   0,   0,  G_,  H_,   0,   0,  K_,   0,  M_,  N_,   0,
   0,   0,  R_,  S_,  T_,  U_,  V_,  W_,   0,  Y_,   0,   0,   0,   0,   0,   0,
   0,  A_,  B_,  C_,  D_,   0,   0,  G_,  H_,   0,   0,  K_,   0,  M_,  N_,   0,
   0,   0,  R_,  S_,  T_,  U_,  V_,  W_,   0,  Y_,   0,   0,   0,   0,   0,   0,
} ;

/**************************************************************/
/*************************************/
    /* dnaDecodeString works in a static buffer, the next 3 functions work in place */
/* Translate a zero-terminated string of base codes into printable letters.
 *
 * cp      : input codes; decoding stops at the first zero byte, so the
 *           code 0 itself cannot appear inside the input.
 * returns : pointer to a static buffer holding at most 255 decoded
 *           letters plus the terminating zero.  The buffer is overwritten
 *           by the next call.  A warning is issued through messout when
 *           the input did not fit and was truncated.
 *
 * dnaDecodeChar has only 16 entries, so the index is reduced to its low
 * nibble; a stray byte above 15 can therefore never read past the table.
 */
char * dnaDecodeString(char *cp)
{ static char buf[256] ;
  int room = sizeof(buf) - 1 ;        /* keep one byte for the terminator */
  char *cq = buf ;

  while (room-- && *cp)
    *cq++ = dnaDecodeChar[((int) *cp++) & 0x0F] ;
  *cq = 0 ;
  if (*cp)                            /* input left over: it was truncated */
    messout
("Warning, dnaDecodeString receives toolong a buffer.") ;
  return buf ;
}

/*************************************/

/* Replace, in place, every base code of a char Array by its printable
 * letter.  All arrayMax(a) elements are converted.
 *
 * dnaDecodeChar has only 16 entries, so the index is reduced to its low
 * nibble to stay inside the table whatever the byte holds.  The loop
 * walks forward from the first element and never forms a pointer in
 * front of the array.
 */
void dnaDecodeArray(Array a)
{ int n = arrayMax(a) ;
  char *cp = arrp(a,0,char) ;

  for ( ; n > 0 ; --n, ++cp)
    *cp = dnaDecodeChar[((int) *cp) & 0x0F] ;
}

/*************************************/

/* Replace, in place, every character of a zero-terminated string by its
 * base code as given by dnaEncodeChar.  The scan stops at the first zero
 * byte found in the (not yet converted) input.
 */
void dnaEncodeString(char *cp)
{ char *p ;

  for (p = cp ; *p ; p++)
    *p = dnaEncodeChar[((int) *p) & 0xFF] ;
}

/*************************************/

/* Replace, in place, each of the arrayMax(a) characters of a char Array
 * by its base code as given by dnaEncodeChar.
 */
void dnaEncodeArray(Array a)
{ int total = arrayMax(a) ;
  char *bases = arrp(a,0,char) ;
  int i ;

  for (i = 0 ; i < total ; i++)
    bases[i] = dnaEncodeChar[((int) bases[i]) & 0xFF] ;
}

/************************************/

/* Write a stored DNA array as letters, either to a file or onto a Stack.
 *
 * f    : when non NULL, the sequence is printed there in lines of 50
 *        letters, each preceded by 9 spaces, followed by an empty line.
 * buf  : used only when f is NULL; receives four spaces then the whole
 *        sequence as one text.
 * dna  : the array as stored (packed or not).  It is never destroyed
 *        here: the caller keeps ownership in every case.  When it is
 *        stored unpacked, dnaUnpackArray hands back this very array, so
 *        its content is rewritten in place through dnaEncodeArray then
 *        dnaDecodeArray.
 *
 * returns FALSE when dna is NULL, is not an array of char, or cannot be
 * unpacked; TRUE otherwise, including when there is nowhere to write.
 */
BOOL dnaDump (FILE* f, Stack buf, Array dna)
{ register int i, j ;
  char *cp, buffer[52] ;
  Array a ;

  if (!dna || dna->size != sizeof(char))
    return FALSE ;

  if (!f && !buf)                 /* neither destination was given */
    return TRUE ;

  a = dnaUnpackArray(dna) ;
  if (!a)                         /* packed header without any base */
    return FALSE ;
  dnaDecodeArray(a) ;
  i = arrayMax(a) ;

  if (f)
    { cp = arrp(a,0,char) ;
      while (i > 0)
        { for (j = 0 ; j < 50 && i-- ; )
            buffer[j++] = *cp++ ;
          buffer[j] = 0 ;
          fprintf(f,"         %s\n", buffer) ;
        }
      fputc('\n',f) ;
    }
  else
    { if (!stackExists(buf))
        messcrash("dnaDump received a bad buffer") ;
      /* catText needs a zero terminated text: write the terminator just
       * after the last base, then restore the element count.  The
       * storage may move while doing so, hence the pointer is fetched
       * only afterwards.
       */
      array(a,i,char) = 0 ;
      arrayMax(a) = i ;
      catText(buf, "    ") ;
      catText(buf, arrp(a,0,char)) ;
    }

  if (a != dna)                   /* only destroy what was created here */
    arrayDestroy(a) ;
  return TRUE ;
}

/************************************/

/* Read a DNA sequence from the input stream 'level' and store it under
 * 'key'.
 *
 * Words are read with freecard/freeword until one of them fails (their
 * exact semantics are defined elsewhere; presumably this ends at the
 * first empty line -- to be confirmed).  Every character known to
 * dnaEncodeChar is appended as a base code; new line, tab and space are
 * skipped; any other character aborts the parse.
 *
 * On success the array is packed and stored by dnaStoreDestroy, the
 * Sequence object of the same name is created if needed and given a
 * DNA tag pointing to key, and TRUE is returned.
 *
 * On failure the array is destroyed, a message goes to stderr and to
 * messdump, and FALSE is returned.  An input without any base is
 * reported as an empty sequence; it used to fall into the "bad char"
 * report with a zero character, which put a NUL byte in the messages.
 */
BOOL dnaParse (int level, KEY key)
{
  char *cp, c = 0, c1 ;
  OBJ obj ; KEY seq ;
  Array dna = arrayCreate(5000,char) ;
  register int i = 0 ;

  while (freecard(level) && (cp = freeword()))
    while ((c = *cp++))
      { if ((c1 = dnaEncodeChar[((int) c) & 0xFF])) /* accepted base codes */
          array(dna,i++,char) = c1 ;
        else
          switch(c)
            {                         /* accepted tabulations */
            case '\n': case '\t': case ' ':
              break;
            default:                /* error in the input file */
              goto abort ;
            }
      }

  if (i)
    { dnaStoreDestroy(key,dna) ;    /* packs, stores and destroys dna */
      lexaddkey(name(key), &seq, _VSequence) ;
      if ((obj = bsUpdate(seq)))
        { bsAddKey(obj, _DNA, key) ;
          bsSave(obj) ;
        }
      return TRUE ;
    }

  /* nothing was read and no offending character was met */
  fprintf (stderr, "Line %7d  DNA %.25s : empty sequence\n",
           freestreamline(level), name(key)) ;
  messdump (
            "  DNAparse error at line %7d in %.25s : empty sequence\n",
            freestreamline(level), name(key)) ;
  arrayDestroy(dna) ;
  return FALSE ;

 abort:
  fprintf (stderr, "Line %7d  DNA %.25s : bad char %c\n",
           freestreamline(level), name(key), c) ;
  messdump (
            "  DNAparse error at line %7d in %.25s : bad char %c\n",
            freestreamline(level), name(key), c) ;
  arrayDestroy(dna) ;
  return FALSE ;
}

/*************************************/
/*************************************/
/* Value of the first byte of a stored array, telling how it is packed:
 *   MAGIC_PACK         2 bases per byte, even number of bases
 *   MAGIC_PACK_ODD     2 bases per byte, odd number of bases
 *   MAGIC_DOUBLE_PACK  4 bases per byte, second byte = number of bases % 4
 * Any other first byte means the array is stored as plain letters.
 */
#define MAGIC_PACK 7 
#define MAGIC_PACK_ODD 8 
#define MAGIC_DOUBLE_PACK 6

/* Tell dnaStoreDestroy which storage form suits an encoded array:
 *   0  fewer than 4 bases: no packing
 *   1  at least one base is not A_, T_, G_ or C_: 2 bases per byte
 *   2  only A_, T_, G_, C_ present: 4 bases per byte
 */
static int dnaPackingType(Array dna)
{ int total = arrayMax(dna) ;
  char *bases = arrp(dna,0,char) ;
  int i ;

  if (total < 4)
    return 0 ;

  for (i = 0 ; i < total ; i++)
    { char b = bases[i] ;

      if (b != A_ && b != T_ && b != G_ && b != C_)
        return 1 ;      /* ambiguous base found, stop looking */
    }

  return 2 ;
}

/********************/

/* Pack an array of base codes in place, store it under key with
 * arrayStore, then destroy it: the caller must not use dna afterwards.
 *
 * The form is chosen by dnaPackingType:
 *   0  the codes are turned back into letters and stored as such
 *   1  byte 0 = MAGIC_PACK or MAGIC_PACK_ODD, then 2 bases per byte,
 *      first base in the high nibble
 *   2  byte 0 = MAGIC_DOUBLE_PACK, byte 1 = number of bases % 4, then
 *      4 bases per byte, first base in the 2 highest bits
 * Packing is done inside the array itself: the write pointer cp never
 * overtakes the read pointer cq, and arrayMax is shrunk at the end.
 */
void dnaStoreDestroy(KEY key, Array dna)
{ char *cp , *cq , c1, *base ;
  int m = arrayMax(dna) ;
  char dbp[16] ;

  /* 2 bit value of each unambiguous base; the other entries stay unset
     and are never read, case 2 only occurs when all bases are a t g c */
  dbp[A_] = 0 ;  dbp[T_] = 1 ;  dbp[G_] = 2 ;  dbp[C_] = 3 ;

  switch(dnaPackingType(dna))
    {
    case 0:   /* no packing , no coding */
      dnaDecodeArray(dna) ;
      break ;

    case 1:  /* 2 bases per byte */
      /* combine bases 0 and 1 before byte 0 is overwritten by the magic */
      c1  = (array(dna,0,char)  << 4) | array(dna,1,char) ;
      array(dna,0,char) = m%2 ? MAGIC_PACK_ODD : MAGIC_PACK ;
      array(dna,1,char) = c1 ;  /* first 2 bases */

      /* all the rest but possibly one */
      base =  arrp(dna,0,char) ;
      cp = cq =  arrp(dna,2,char) ;
      m -= 2 ;
      while(m>1)
	{
	  *cp++ = ((*cq) << 4 ) | *(cq + 1) ;
	  cq += 2 ;
	  m -= 2 ;
	}
      if(m)              /* last base in odd case */
	{
	  *cp++ = *cq << 4 ;
	}
      arrayMax(dna) = cp - base ; 
      break ;
      
    case 2:  /* 4 bases per byte */
      /* combine bases 0 to 3 before bytes 0 to 2 are overwritten */
      cq = arrp(dna,0,char) ;
      c1 =
	(dbp[((int) *cq) & 0xFF] << 6 ) |
	  (dbp[((int) *(cq+1)) & 0xFF] << 4 ) |
	    (dbp[((int) *(cq+2)) & 0xFF] << 2 ) |
	      dbp[((int) *(cq+3)) & 0xFF] ;
      array(dna,0,char) =  MAGIC_DOUBLE_PACK ;
      array(dna,1,char) = m%4 ;
      array(dna,2,char) = c1 ;  /* first 4 bases */

                              /* all the rest but possibly 3 */
      base =  arrp(dna,0,char) ;
      cp =  arrp(dna,3,char) ;
      cq =  arrp(dna,4,char) ;
      m -= 4 ;
      while(m>3)
	{
	  *cp++ =
	    (dbp[((int) *cq) & 0xFF] << 6 ) |
	      (dbp[((int) *(cq+1)) & 0xFF] << 4 ) |
		(dbp[((int) *(cq+2)) & 0xFF] << 2 ) |
		  dbp[((int) *(cq+3)) & 0xFF] ;
	  cq += 4 ;
	  m -= 4 ;
	}

      /* cp is not advanced below: moving base back by one instead makes
         cp - base count the partly filled byte at *cp */
      if(m--)              /* last 3 bases */
	{ base-- ; /* to fix arrayMax, without using cp++ */
	  *cp = (dbp[((int) *cq++) & 0xFF] << 6 ) ;
	  if(m--)
	    { *cp |= (dbp[((int) *cq++) & 0xFF] << 4 ) ;
	      if(m--)
	        *cp |= (dbp[((int) *cq++) & 0xFF] << 2) ;
	    }
	}
      arrayMax(dna) = cp - base ; 
      break ;
    }

   arrayStore(key,dna,"c") ;
   arrayDestroy(dna) ;
 }

/**************************************************************************/

/* Turn a stored array back into an array of base codes.
 *
 * The first byte selects the layout:
 *   MAGIC_PACK / MAGIC_PACK_ODD  every following byte holds 2 bases, high
 *                                nibble first; in the ODD form the last
 *                                byte holds a single base
 *   MAGIC_DOUBLE_PACK            second byte is the number of bases % 4,
 *                                every following byte holds 4 bases,
 *                                highest 2 bits first
 *   anything else                plain letters
 *
 * returns 0 when pack is 0 or when a packed form carries no data byte;
 * a newly created array for the packed forms (the caller destroys it,
 * pack is left untouched); pack itself, encoded in place by
 * dnaEncodeArray, for the plain form.
 */
static Array dnaUnpackArray(Array pack)
{ static const char twoBitToBase[4] = {A_, T_, G_, C_} ;
  Array bases ;
  char magic, *src, *dst ;
  int nBytes, nBases, i ;

  if (!pack)
    return 0 ;

  magic = *arrp(pack,0,char) ;
  switch (magic)
    {
    case MAGIC_PACK_ODD:
    case MAGIC_PACK:
      nBytes = arrayMax(pack) ;
      if (nBytes <= 1)
        return 0 ;

      /* one magic byte, then 2 bases per byte, minus 1 in the odd form */
      nBases = 2*(nBytes-1) ;
      if (magic == MAGIC_PACK_ODD)
        nBases-- ;

      bases = arrayCreate(nBases+1,char) ;  /* one spare byte as terminator */
      array(bases,nBases-1,char) = 0 ;      /* sets arrayMax to nBases */
      src = arrp(pack,1,char) ;             /* first data byte */
      dst = arrp(bases,0,char) ;
      for (i = 0 ; i < nBases ; i++)
        { if (i & 1)
            dst[i] = src[i/2] & (char)(15) ;          /* low nibble */
          else
            dst[i] = (src[i/2] >> 4) & (char)(15) ;   /* high nibble */
        }
      return bases ;

    case MAGIC_DOUBLE_PACK:
      nBytes = arrayMax(pack) ;
      if (nBytes <= 2)
        return 0 ;

      nBases = array(pack,1,char) ;   /* bases held by the last byte ... */
      if (!nBases)
        nBases = 4 ;                  /* ... 0 standing for a full byte */
      nBases = 4*(nBytes-2) - (4-nBases) ;

      bases = arrayCreate(nBases+1,char) ;  /* one spare byte as terminator */
      array(bases,nBases-1,char) = 0 ;      /* sets arrayMax to nBases */
      src = arrp(pack,2,char) ;             /* first data byte */
      dst = arrp(bases,0,char) ;
      for (i = 0 ; i < nBases ; i++)
        dst[i] = twoBitToBase[(src[i/4] >> (6 - 2*(i%4))) & (char)(3)] ;
      return bases ;

    default:    /* stored as letters: encode them where they are */
      dnaEncodeArray(pack) ;
      return pack ;
    }
}

/**************************************************************************/

/* Fetch the array stored under key and return it as an array of base
 * codes, or 0 when nothing is stored or it cannot be unpacked.
 * The stored form is destroyed here unless dnaUnpackArray returned that
 * very array; the caller destroys the result.
 */
Array dnaGet(KEY key)
{ Array stored = arrayGet(key, char,"c") ;
  Array bases ;

  if (!stored)
    return 0 ;

  bases = dnaUnpackArray(stored) ;
  if (bases != stored)
    arrayDestroy(stored) ;

  return bases ;
}

/**********************************************************/
/**********************************************************/
   /* called also from dnacptfastaDump */
/* Write one sequence to fil: a header line, then the bases in lines of
 * 50 letters.
 *
 * key    : its name goes in the header, prefixed by "ACEDB:" unless it
 *          already contains a ':'
 * title  : its name is added to the header when iskey(title) is non zero
 * a      : array of base codes
 * from   : first base written, negative values are taken as 0
 * length : number of bases wanted; the range is cut at the end of a.
 *          The header shows from and length as received (from after the
 *          negative check), not the range after cutting.
 *
 * returns FALSE, writing nothing, when a or fil is NULL; TRUE otherwise.
 * The NULL test now comes before any use of a: arrayMax(a) used to be
 * evaluated first.  The table index is reduced to its low nibble since
 * dnaDecodeChar has only 16 entries.
 */
BOOL dumpFastA (KEY key, KEY title,
		Array a, int from, int length, FILE* fil)
{ int i, j , end ;

  if (!a || !fil)
    return FALSE ;

  if (from < 0)
    from = 0 ;
  end = from + length ;
  if (end > arrayMax(a))
    end = arrayMax(a) ;

  if (strchr (name(key), ':'))
    fprintf (fil, "\n\n> %s ", name(key)) ;
  else
    fprintf (fil, "\n\n> ACEDB:%s ", name(key)) ;
  fprintf (fil, "\t %s from %d length %d\n", iskey(title) ? name(title) : "",
           from, length) ;

  for (i = from ; i < end ;)
    { for (j = 50 ; i < end && j-- ; i++)
        fputc (dnaDecodeChar[((int) arr(a, i, char)) & 0x0F], fil) ;
      fputc ('\n',fil) ;
    }
  return TRUE ;
}


/**********************************************************/

/* Open a "fasta" file for writing through filqueryopen.  The directory
 * and file name buffers are static, so they persist between calls.
 * returns the open file, or 0 after a messout warning on failure.
 */
FILE * dnaFileOpen(void)
{ static char fileName[24],dirName[80] ;
  FILE *fasta ;

  fasta = filqueryopen (dirName, fileName, "fasta", "w") ;
  if (fasta)
    return fasta ;

  messout ("failed to open fasta file") ;
  return fasta ;
}

/**********************************************************/

/* Write, in FASTA format, the DNA of every member of kSet to a file
 * opened by dnaFileOpen.
 *
 * Members for which iskey() is not 2 are skipped.  A Sequence member
 * contributes the DNA key found under its DNA tag, a DNA member
 * contributes itself.  The collected DNA keys are sorted, compressed and
 * put in the order given by keySetAlphaHeap before being dumped.  The
 * number of sequences written is reported through messout.
 *
 * The header of each entry uses the Sequence key found by lexReClass,
 * with its Title if any.  When lexReClass fails the DNA key itself is
 * used: 'key' used to keep whatever value an earlier iteration had left
 * in it, so the entry could be written under another sequence's name.
 */
void dumpKeySetFastA (KEYSET kSet)
{
  OBJ obj ;
  KEYSET dnaSet = 0 , dnaAlpha ;
  KEY dna, key, titleKey = 0 ;
  int i, j = 0 , n = 0 ;
  Array a = 0 ;
  FILE *fil = dnaFileOpen();

  if (!fil)
    return ;

  /* collect the DNA keys */
  dnaSet = keySetCreate() ;
  for (i = 0 ; i < keySetMax(kSet) ; ++i)
    { key = keySet (kSet, i) ;

      if (iskey (key) != 2)
        continue ;
      if (class (key) == _VSequence)
        { if ((obj = bsCreate (key)))
            { if (bsGetKey (obj, _DNA, &dna))
                keySet(dnaSet,j++) = dna ;
              bsDestroy (obj) ;
            }
        }
      else if (class (key) == _VDNA)
        keySet(dnaSet,j++) = key ;
    }

  keySetSort(dnaSet) ;
  keySetCompress(dnaSet) ;

  dnaAlpha =
    keySetAlphaHeap(dnaSet, keySetMax(dnaSet)) ;

  /* dump them */
  for (i = 0 ; i < keySetMax(dnaAlpha) ; ++i)
    { dna = keySet(dnaAlpha, i) ;
      a = dnaGet (dna) ;
      if (!a)
        continue ;

      titleKey = 0 ;
      if (lexReClass(dna, &key, _VSequence))
        { if ((obj = bsCreate(key)))
            { bsGetKey (obj, _Title, &titleKey) ;
              bsDestroy (obj) ;
            }
        }
      else
        key = dna ;     /* no such Sequence: name the entry after the DNA */

      if (dumpFastA (key, titleKey, a, 0, arrayMax(a),fil))
        n++ ;
      arrayDestroy(a) ; /* watch out Richard you often forget it */
    }

  keySetDestroy(dnaSet) ;
  keySetDestroy(dnaAlpha) ;
  fclose (fil) ;

  messout("I wrote %d sequences", n) ;
}

/**********************************************************/
/**********************************************************/
 
