/*  File: embl.c
 *  Author: Richard Durbin (rd@mrc-lmba.cam.ac.uk)
 *  Copyright (C) J Thierry-Mieg and R Durbin, 1992
 *-------------------------------------------------------------------
 * This file is part of the ACEDB genome database package, written by
 * 	Richard Durbin (MRC LMB, UK) rd@mrc-lmba.cam.ac.uk, and
 *	Jean Thierry-Mieg (CRBM du CNRS, France) mieg@frmop11.bitnet
 *
 * Description: creates an EMBL submission file from ACEDB data
 	authors for submission obtained from From_Author field
	current assumptions (not necessarily valid):
	   - all CDS's are one level down, i.e. not recursive
	   - start and end are found for all embedded sequences
	   - start < end for general features: promoters etc.
 * Exported functions: emblDump()
 * HISTORY:
 * Last edited: Apr  7 06:18 1992 (rd)
 * * Feb 23 01:27 1992 (rd): order() -> join()
 * Created: Sun Feb 23 01:26:18 1992 (rd)
 *-------------------------------------------------------------------
 */

#include "acedb.h"
#include "array.h"
#include "bs.h"
#include "classes.wrm"
#include "tags.wrm"
#include "systags.wrm"
#include "dna.h"
#include "display.h"
#include "lex.h"

/* Output stream for the dump in progress: opened and closed by
 * emblDoDump(), written to by the feature-writing routines it calls.
 */
static FILE *fil ;		/* file being written to */
/* Scratch text buffer; cdsFeatures() assembles its /product qualifier here. */
static char buf[1000] ;

/*******************************/

/* emblifyName: reduce a name to its first word followed by the initial
 * of its second word, e.g. "Durbin Richard" -> "Durbin R.".
 *   nam: NUL-terminated name; words are separated by spaces.  A name
 *        consisting of a single word is returned without an initial.
 * Returns a pointer to a static buffer, which is overwritten by the
 * next call.  A first word too long for that buffer is truncated so
 * that the initial and the terminator always fit.
 */
static char* emblifyName (char* nam)
{
  static char work[128] ;
  char *cp = work ;
  char *limit = work + sizeof(work) - 4 ; /* keep room for " X." and NUL */

  while (*nam && *nam != ' ')	/* first word; drop what does not fit */
    { if (cp < limit)
	*cp++ = *nam ;
      ++nam ;
    }

  while (*nam == ' ') ++nam ;	/* any run of separating spaces */

  if (*nam)			/* initial */
    { *cp++ = ' ' ;
      *cp++ = *nam ;
      *cp++ = '.' ;
    }

  *cp = 0 ;
  return work ;
}

/*******************************/

static void cdsFeatures (KEY key, int start, int end)
{
  OBJ	obj = bsCreate(key) ;
  BOOL	isDown = (end > start) ;
  int	i, j1, j2, pos ;
  KEY	method, remark ;
  char  **cpp ;
  Array a ;		/* work array for bsFlatten */
  static Stack text = 0 ;

  if (!text)
    text = stackCreate(64) ;
  
  if (!obj)
    return ;

  a = arrayCreate (32, BSunit) ;

  if (bsFindTag (obj, _Source_Exons) && bsFlatten (obj, 2, a))
    for (i = 0 ; i < arrayMax(a) ; i += 2)
      if (isDown)
	{ arr(a,i,BSunit).i += start - 1 ;
	  arr(a,i+1,BSunit).i += start - 1 ;
	}
      else
	{ arr(a,i,BSunit).i = start + 1 - arr(a,i,BSunit).i ;
	  arr(a,i+1,BSunit).i = start + 1 - arr(a,i+1,BSunit).i ;
	}
  else
    { arrayMax(a) = 0 ;
      array(a,0,BSunit).i = start ;
      array(a,1,BSunit).i = end ;
    }

  fprintf (fil, "FT   CDS             ") ; /* CDS entry */
  if (!isDown) fprintf (fil, "complement(") ;
  if (arrayMax(a) > 2) fprintf (fil, "join(") ;
  for (i = 0 ; i < arrayMax(a) ; i += 2)
    { if (i)
	fputc (',',fil) ;
      if (!((i-4)%6))
	fprintf (fil, "\nFT                   ") ;
      if (isDown)
	{ j1 = i ; j2 = j1+1 ; }
      else
	{ j1 = arrayMax(a)-i-1 ; j2 = j1-1 ; }
      fprintf (fil, "%d..%d", 
	       arr(a,j1,BSunit).i, arr(a,j2,BSunit).i) ;
    }
  if (arrayMax(a) > 2) fputc (')', fil) ;
  if (!isDown) fputc (')', fil) ;
  fputc ('\n', fil) ;

  *buf = 0 ;
  strcat (buf, messprintf ("/product=\"%s", name(key))) ;
  if (bsGetKey (obj, _CDS_predicted_by, &method))
    strcat (buf, messprintf (", protein predicted using %s",
			     name(method))) ;
  if (bsGetKey (obj, _DB_remark, &remark))
    do
      { strcat (buf, messprintf (", %s", name(remark))) ;
      } while (bsGetKey (obj, _bsDown, &remark)) ;
  strcat (buf, "\"") ;
  for (cpp = uBrokenLines(buf, 56) ; *cpp ; ++cpp)
    fprintf (fil, "FT                   %s\n", *cpp) ;
  
  if (bsGetData (obj, _TSL_site, _Int, &pos))
    fprintf (fil, 
     "FT                   /note=\"Possible TSL site at %d\"\n",
     isDown?pos+start-1:start+1-pos) ;

/* exon code commented out after agreement that it is not a good thing
  for (i = 0 ; i < arrayMax(a) ; i += 2)
    { fprintf (fil, "FT   exon            ") ;
      if (isDown)
	{ if (!i) fputc ('<', fil) ;
	  fprintf (fil,"%d..", arr(a,i,BSunit).i) ;
	  if (i+2 == arrayMax(a)) fputc ('>', fil) ;
	  fprintf (fil, "%d\n", arr(a,i+1,BSunit).i) ;
	}
      else
	{ fprintf (fil,"complement(") ;
	  if (i+2 == arrayMax(a)) fputc ('<', fil) ;
	  fprintf (fil,"%d..", arr(a,i+1,BSunit).i) ;
	  if (!i) fputc ('>',fil) ;
	  fprintf (fil,"%d)\n", arr(a,i,BSunit).i) ;
	}
      fprintf (fil, "FT                   /number=%d\n", 1 + i/2) ;
    }
*/

  bsDestroy (obj) ;
  arrayDestroy (a) ;
}

/**********************************************/

static void generalFeature (KEY tag, int start, int end, BSunit u)
{
  char **cpp ;

  switch (tag)
    {
    case _promoter: case _misc_signal: case _misc_feature:
    case _mutation: 
      fprintf (fil, "FT   %-16s%d..%d\n", name(tag), start, end) ;
      if (u.k)
	{ fprintf (fil, "FT                   /note=\"") ;
	  cpp = uBrokenLines(name(u.k), 38) ; 
	  while (TRUE)
	    { fprintf (fil, "%s", *cpp) ;
	      if (!*++cpp)
		{ fprintf (fil, "\"\n") ;
		  break ;
		}
	      fprintf (fil, "\nFT                   ") ;
	    }
	}
      break ;
    case _polyA_site: case _repeat_region: case _repeat_unit:
    case _sig_peptide: case _mat_peptide: case _old_sequence:
    case _modified_base:
      fprintf (fil, "FT   %-16s%d..%d\n", name(tag), start, end) ;
      if (u.s)
	{ fprintf (fil, "FT                   /note=\"") ;
	  cpp = uBrokenLines(u.s, 40) ; 
	  while (TRUE)
	    { fprintf (fil, "%s", *cpp) ;
	      if (!*++cpp)
		{ fprintf (fil, "\"\n") ;
		  break ;
		}
	      fprintf (fil, "\nFT                   ") ;
	    }
	}
      break ;
    default:
      messout ("Don't know how to deal with feature %s yet",
	       name(tag)) ;
    }
}

/**********************************************/

/* emblDoDump: write an EMBL submission entry for one sequence.
 *   seq: key of the object to dump.  It must be in class _VSequence,
 *        carry the Genomic_Canonical tag and have DNA attached;
 *        otherwise a message is shown and nothing is written.
 * The output file is chosen through filqueryopen() (extension "embl");
 * if that returns no file the dump is abandoned silently.
 * The entry consists of the ID/DE/KW/OS/OC header, three reference
 * blocks (authors of the first come from From_Author), CC notes from
 * DB_remark, the feature table (Features, then Has_CDS) and the
 * sequence itself with base counts.
 * Features or CDS's whose coordinates lie outside 1..length are still
 * written, but a warning is shown for each.
 */
static void emblDoDump (KEY seq)
{
  static char dname[80], fname[24], seqName[24] ;
  KEY key ;
  OBJ Seq = 0 ;
  Array dna = 0 ;
  Array a  ;		/* work array for bsFlatten */
  char *translate ;
  char *message = 0, *cp, *cq, **cpp ;
  int length, i, ix, len, freq[5] ;
#define abort(x)	{ message = x ; goto finish ; }
#define xxout		fprintf (fil, "XX\n")

  fil = 0 ;
  a = arrayCreate (32, BSunit) ;

  if (!(class(seq) == _VSequence) || !(Seq = bsCreate(seq)))
    abort("Sequence object missing") ;
  if (!bsFindTag (Seq, _Genomic_Canonical))
    abort("Not a genomic cosmid") ;
  if (!bsGetKey (Seq, _DNA, &key) || !(dna = dnaGet(key)))
    abort ("No DNA attached to Sequence object") ;
  if (!(fil = filqueryopen (dname, fname, "embl", "w")))
    goto finish ;

  length = arrayMax(dna) ;

  /* Upper-cased copy of the sequence name.  seqName is static, so it
   * must be terminated on every call or a shorter name would keep the
   * tail of the previous one; the copy is also bounded to the buffer.
   */
  cp = seqName ; cq = name(seq) ; 
  while (*cq && cp < seqName + sizeof(seqName) - 1) 
    *cp++ = freeupper(*cq++) ;
  *cp = 0 ;

  fprintf (fil, "ID   CE%s nematode; DNA; INV; %d BP.\n", 
	   seqName, length) ;
  xxout ;
  fprintf (fil, "DE   Caenorhabditis elegans cosmid %s\n", 
	   seqName) ;
  xxout ;
  fprintf (fil, "KW   .\n") ;
  xxout ;
  fprintf (fil,
	   "OS   Caenorhabditis elegans (nematode)\n"
	   "OC   Eukaryota; Animalia; Metazoa; Nemata; Secernentea; Rhabditia ;\n"
	   "OC   Rhabditida; Rhabditina; ; Rhabditoidea; Rhabditidae.\n") ;
  xxout ;
  fprintf (fil, "RN   [1]\n") ;
  fprintf (fil, "RP   1-%d\n", length) ;
  
  /* Authors of reference 1, comma separated and wrapped onto
   * successive RA lines; ix is the current output column.
   */
  ix = 0 ;
  if (bsFindTag (Seq, _From_Author) && bsFlatten (Seq, 1, a))
    { for (i = 0 ; i < arrayMax(a) ; i += 1)
	{ cp = emblifyName (name(arr(a,i,BSunit).k)) ;
	  len = strlen (cp) ;
	  /* wrap only when something is already on the line, so that an
	     over-long name cannot produce a stray ",\n" before "RA" */
	  if (ix && ix + len > 70)
	    { fprintf (fil, ",\n") ; ix = 0 ; }
	  if (ix)
	    { fprintf (fil, ", ") ;  ix += 2 ; }
	  else
	    { fprintf (fil, "RA   ") ; ix += 5 ; }
	  fprintf (fil, "%s", cp) ; /* names are data, never a format */
	  ix += len ;
	}
      fprintf (fil, ";\n") ;
    }
  else
    { messout ("No authors") ;
      fprintf (fil, "RA   ;\n") ;
    }
  fprintf (fil,
	   "RT   ;\n"
	   "RL   Submitted (??) to the EMBL Data Library by:\n"
	   "RL   Nematode Sequencing Project, MRC Laboratory of Molecular Biology,\n"
	   "RL   Cambridge CB2 2QH, England and Department of Genetics, Washington\n"
	   "RL   University, St. Louis, MO 63110, USA. E-mail:\n"
	   "RL   jes@mrc-lmba.cambridge.ac.uk or rw@nematode.wustl.edu\n") ;
  xxout ;
  fprintf (fil,
	   "RN   [2]\n"
	   "RA   Sulston J., Du Z., Thomas K., Wilson R., Hillier L., Staden R., \n"
	   "RA   Halloran N., Green P., Thierry-Mieg J., Qiu L., Dear S., \n"
	   "RA   Coulson A., Craxton M., Durbin R., Berks M., Metzstein M., \n"
	   "RA   Hawkins T., Ainscough R., Waterston R.;\n"
	   "RT   \"The C. elegans Sequencing Project: A Beginning\"; \n"
	   "RL   Nature 356:37-41 (1992).\n") ;
  xxout ;
  fprintf (fil,
	   "RN   [3]\n"
	   "RA   Roberts L.;\n"
	   "RT   \"The Worm Project\"; \n"
	   "RL   Science 248:1310-1313(1990).\n") ;
  xxout ;

  fprintf (fil, "CC   NOTES:\n") ;
  if (bsFindTag (Seq, _DB_remark) && bsFlatten (Seq, 1, a))
    for (i = 0 ; i < arrayMax(a) ; ++i)
      for (cpp = uBrokenLines(name(arr(a,i,BSunit).k),67)
	   					; *cpp ; ++cpp)
	fprintf (fil,"CC   %s\n",*cpp) ;
  if (bsFindTag (Seq, _Has_CDS))
    fprintf (fil, 
     "CC   Coding sequences below are predicted from computer analysis,\n"
     "CC   using the program Genefinder (P. Green, mss in preparation).\n") ;
  if (bsFindTag (Seq, _TSL_site))
    fprintf (fil, "CC   \"TSL\" = trans-spliced leader.\n") ;
  xxout ;

  fprintf (fil, "FH   Key             Location/Qualifiers\nFH\n") ;

  /* general features: rows of (tag, start, end, annotation) */
  if (bsFindTag (Seq, _Features) && bsFlatten (Seq, 4, a))
    for (i = 0 ; i < arrayMax(a) ; i += 4)
      { generalFeature (arr(a,i,BSunit).k,
			arr(a,i+1,BSunit).i,
			arr(a,i+2,BSunit).i,
			arr(a,i+3,BSunit)) ;
	if (arr(a,i+1,BSunit).i < 1 || arr(a,i+1,BSunit).i > length ||
	    arr(a,i+2,BSunit).i < 1 || arr(a,i+2,BSunit).i > length)
	  messout ("Beware - feature %s extends beyond the cosmid",
		   name(arr(a,i,BSunit).k)) ;
      }

  /* coding sequences: rows of (CDS object, start, end) */
  if (bsFindTag (Seq, _Has_CDS) && bsFlatten (Seq, 3, a))
    for (i = 0 ; i < arrayMax(a) ; i += 3)
      { cdsFeatures (arr(a,i,BSunit).k,
		     arr(a,i+1,BSunit).i,
		     arr(a,i+2,BSunit).i) ;
	if (arr(a,i+1,BSunit).i < 1 || arr(a,i+1,BSunit).i > length ||
	    arr(a,i+2,BSunit).i < 1 || arr(a,i+2,BSunit).i > length)
	  messout ("Beware - CDS %s extends beyond the cosmid",
		   name(arr(a,i,BSunit).k)) ;
      }
  xxout ;

  for (i = 5 ; i-- ; )		/* the sequence itself */
    freq[i] = 0 ;
  for (i = arrayMax(dna) ; i-- ; )
    switch (arr(dna,i,char))
      {
      case A_: ++freq[0] ; break ;
      case C_: ++freq[1] ; break ;
      case G_: ++freq[2] ; break ;
      case T_: ++freq[3] ; break ;
      default:	++freq[4] ; break ;
      }
  fprintf (fil, "SQ   Sequence  %d BP;   %d A; %d C; %d G; %d T; %d other;",
	   length, freq[0], freq[1], freq[2], freq[3], freq[4]) ;

  if (bsFindTag (Seq, _RNA))
    translate = rnaDecodeChar ;
  else
    translate = dnaDecodeChar ;
  /* 60 bases per line, in blocks of 10 separated by a space */
  for (i = 0 ; i < arrayMax(dna) ; ++i)
    if (!(i%60))
      fprintf (fil, "\n     %c", translate[arr(dna,i,char)]) ;
    else if (!(i%10))
      fprintf (fil, " %c", translate[arr(dna,i,char)]) ;
    else
      fputc (translate[arr(dna,i,char)], fil) ;
  fprintf (fil, "\n//\n") ;

finish:				/* common exit: report, then release */
  if (message) messout (message) ;
  if (fil) fclose (fil) ;
  bsDestroy (Seq) ;
  arrayDestroy (dna) ;
  arrayDestroy (a) ;
}

/*******************************/

/* emblDump: exported entry point.  Registers emblDoDump() with
 * displayBlock(), together with the prompt text passed alongside it.
 */
void emblDump (void)
{
  char *prompt = " I will embl dump the corresponding sequence " ;

  displayBlock (emblDoDump, prompt) ;
}
