/*
  mfstlib3.c -- reading Multi-FASTA formatted sequences

    Copyright (C) 1999,2000 Naohisa Goto <ngoto@gen-info.osaka-u.ac.jp>
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

/* mfstlib3.c 19990806 20000403 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/stat.h>
#include <unistd.h>
#include "mfstlib3.h"

/* Size in bytes of every block that tmpMem_useNextBlock() appends, and the
   fallback size of the first block when the input file size is unknown. */
#define TMPMEM_BLOCKSIZE 65536
/* Slack in bytes: tmpMem_export_to_malloc() shrinks a lone block in place
   when the stored length plus this value reaches the block size. */
#define TMPMEM_TH 64

/* Integer type used for every tmpMem byte count and block size. */
typedef unsigned long tmpMem_len_t;

/* One node of the singly linked list of buffers owned by a struct tmpMem. */
struct tmpMem_block {
  tmpMem_len_t len;                /* capacity of `memory` in bytes */
  struct tmpMem_block *next_block; /* following block, or NULL at the tail */
  unsigned char *memory;           /* malloc()ed storage of `len` bytes */
};

/* Growable byte buffer made of a chain of blocks; bytes are appended with
   tmpMem_putc() and extracted with tmpMem_export_to_malloc(). */
struct tmpMem {
  FILE *fp;                           /* stream given to tmpMem_open()/tmpMem_reset_full() */
  int blocks;                         /* number of blocks currently in the chain */
  struct tmpMem_block *top_block;     /* first block of the chain, or NULL */
  tmpMem_len_t len;                   /* bytes written since the last reset */
  struct tmpMem_block *current_block; /* block currently being written to */
  unsigned char *current_ptr;         /* next write position inside current_block */
  tmpMem_len_t current_rest;          /* bytes still free in current_block */
};

/* global variables */
/* Number of characters out_seq_putc() writes before inserting a newline;
   changed through set_width(). */
static int sg_width = 60;
/* Stream out_seq_putc() writes to; set through set_output(). */
static FILE *sg_outfile = NULL;

/* {from, to} character pairs applied by get_convert_table() for
   CONVERT_COMPLEMENT_DNA.  The list ends at the {-1, -1} sentinel.
   'u'/'U' input is mapped to 'a'/'A'. */
static int sg_complement_dna[][2] = {
  {  'a', 't' },
  {  't', 'a' },
  {  'g', 'c' },
  {  'c', 'g' },
  {  'A', 'T' },
  {  'T', 'A' },
  {  'G', 'C' },
  {  'C', 'G' },
  {  'u', 'a' },
  {  'U', 'A' },
  {  -1, -1 }
};

/* {from, to} character pairs applied by get_convert_table() for
   CONVERT_COMPLEMENT_RNA.  The list ends at the {-1, -1} sentinel.
   't'/'T' input is mapped to 'a'/'A'. */
static int sg_complement_rna[][2] = {
  {  'a', 'u' },
  {  'u', 'a' },
  {  'g', 'c' },
  {  'c', 'g' },
  {  'A', 'U' },
  {  'U', 'A' },
  {  'G', 'C' },
  {  'C', 'G' },
  {  't', 'a' },
  {  'T', 'A' },
  {  -1, -1 }
};

/* {from, to} pairs applied by get_convert_table() for CONVERT_RNA:
   replaces t/T with u/U.  Ends at the {-1, -1} sentinel. */
static int sg_torna[][2] = {
  { 't', 'u' },
  { 'T', 'U' },
  { -1, -1 }
};

/* {from, to} pairs applied by get_convert_table() for CONVERT_DNA:
   replaces u/U with t/T.  Ends at the {-1, -1} sentinel. */
static int sg_todna[][2] = {
  { 'u', 't' },
  { 'U', 'T' },
  { -1, -1 }
};

/* prototypes */
/* tmpMem: growable temporary buffer (public-to-this-file operations) */
static struct tmpMem *tmpMem_open(FILE *fp);
static void tmpMem_close(struct tmpMem *p);
static void *tmpMem_export_to_malloc(struct tmpMem *p);
static tmpMem_len_t tmpMem_get_len(struct tmpMem *p);
static int tmpMem_putc(struct tmpMem *p, const int c);
static struct tmpMem *tmpMem_reset_full(struct tmpMem *p, FILE *fp);
static void tmpMem_reset(struct tmpMem *p);

/* tmpMem: block-list helpers */
static struct tmpMem_block *tmpMem_freeBlock(struct tmpMem_block *bp);
static void *tmpMem_useNextBlock(struct tmpMem *p);
static void *tmpMem_addNewBlock(struct tmpMem *p, tmpMem_len_t blocksize);

/* functions for reading FASTA records */
static int read_fasta_custom(SEQUENCE *sp, FILE *fp, struct tmpMem *tp);
static int is_start_fasta(int c);
static int is_end_line(int c);
static int is_proper_character(int c);

/* *************************************************************** */
/*
 * set_output -- choose the stream that out_seq_putc() writes to.
 *
 * fp is stored as given (it is not validated).  Always returns 0.
 */
int set_output(FILE *fp)
{
  sg_outfile = fp;

  return 0;
} /* end of func */

/*
 * set_width -- change the line width used by out_seq_putc().
 *
 * A width of zero or less leaves the current setting untouched, so
 * set_width(0) can be used to query it.  Returns the width in effect
 * after the call.
 */
int set_width(int width)
{
  if (width <= 0) {
    return sg_width;
  }

  sg_width = width;
  return sg_width;
} /* end of func */

/* ********************************************************* */
/* Tag stored in SEQUENCE.magic by seq_init(); seq_close() re-initialises
   such an object instead of freeing it. */
#define SEQ_MAGIC 0x14142135
/* Tag stored in SEQUENCE.magic by seq_open(); seq_close() free()s such an
   object. */
#define SEQ_MAGIC_MALLOC 0x17320508

/*
 * seq_open -- allocate an empty SEQUENCE on the heap.
 *
 * The object is initialised by seq_init() and then tagged with
 * SEQ_MAGIC_MALLOC so that seq_close() releases the structure itself.
 * Returns NULL when malloc() fails.
 */
SEQUENCE *seq_open(void)
{
  SEQUENCE *fresh = malloc(sizeof(SEQUENCE));

  if (fresh != NULL) {
    seq_init(fresh);
    fresh->magic = SEQ_MAGIC_MALLOC;
  }

  return fresh;
} /* end of func */

/*
 * seq_init -- put a caller-owned SEQUENCE into the empty state.
 *
 * Clears both buffers and lengths and tags the object with SEQ_MAGIC.
 * Existing name/seq pointers are overwritten, not freed.
 */
void seq_init(SEQUENCE *p)
{
  p->name = NULL;
  p->namelen = 0;

  p->seq = NULL;
  p->seqlen = 0;

  p->magic = SEQ_MAGIC;
} /* end of func */

/*
 * seq_close -- release the buffers held by a SEQUENCE.
 *
 * For an object tagged SEQ_MAGIC (set by seq_init) the buffers are freed
 * and the object is re-initialised; for SEQ_MAGIC_MALLOC (set by seq_open)
 * the structure itself is freed as well.
 *
 * Returns 0 on success, -1 when p is NULL or carries an unknown magic.
 */
int seq_close(SEQUENCE *p)
{
  int on_heap;

  if (p == NULL) return -1;

  if (p->magic == SEQ_MAGIC_MALLOC) {
    on_heap = 1;
  } else if (p->magic == SEQ_MAGIC) {
    on_heap = 0;
  } else {
    return -1;
  }

  /* free(NULL) is a no-op, so no guards are needed */
  free(p->name);
  free(p->seq);

  if (on_heap) {
    free(p);
  } else {
    seq_init(p);
  }

  return 0;
} /* end of func */

/* ********************************************************* */
/* Scratch buffer shared by every get_fasta() call; created lazily there. */
static struct tmpMem *sg_tmpMem_h = NULL;

/*
 * seq_garbage_collection -- free the scratch buffer used by get_fasta().
 *
 * The buffer is recreated by the next get_fasta() call.  Always returns 0.
 */
int seq_garbage_collection(void)
{
  struct tmpMem *stale = sg_tmpMem_h;

  sg_tmpMem_h = NULL;
  tmpMem_close(stale); /* tmpMem_close() accepts NULL */

  return 0;
} /* end of func */

/*
 * get_fasta -- read the next FASTA record from infile into *p.
 *
 * The file-wide scratch buffer is created on the first call and merely
 * rewound on later calls.  Returns -2 when the scratch buffer cannot be
 * created, otherwise whatever read_fasta_custom() returns.
 */
int get_fasta(SEQUENCE *p, FILE *infile)
{
  if (sg_tmpMem_h != NULL) {
    tmpMem_reset(sg_tmpMem_h);
    return read_fasta_custom(p, infile, sg_tmpMem_h);
  }

  sg_tmpMem_h = tmpMem_open(infile);
  if (sg_tmpMem_h == NULL) {
    return -2;
  }

  return read_fasta_custom(p, infile, sg_tmpMem_h);
} /* end of func */

/* ************************************************************** */
/* new functions of mfstlib3 */
/*
 * seq_duplicate -- deep-copy the name and sequence of src into dst.
 *
 * dst must be empty (name and seq both NULL); it is modified only when
 * both allocations succeed, so a failed call leaves it untouched.
 * A NULL src->name or src->seq is copied as a zero-filled buffer of the
 * recorded length instead of being passed to memcpy().
 *
 * Returns 0 on success, -1 on a NULL argument or a non-empty dst,
 * -2 when memory cannot be allocated.
 */
int seq_duplicate(SEQUENCE *src, SEQUENCE *dst)
{
  unsigned char *name_copy;
  unsigned char *seq_copy;

  if (src == NULL || dst == NULL) return -1;
  if (dst->name != NULL || dst->seq != NULL) return -1;

  /* calloc so that the buffers are terminated even when src has no data */
  name_copy = calloc(src->namelen + 1, sizeof(unsigned char));
  seq_copy = calloc(src->seqlen + 1, sizeof(unsigned char));

  if (name_copy == NULL || seq_copy == NULL) {
    /* release whichever half succeeded; dst stays empty */
    free(name_copy);
    free(seq_copy);
    return -2;
  }

  if (src->name != NULL) memcpy(name_copy, src->name, src->namelen + 1);
  if (src->seq != NULL) memcpy(seq_copy, src->seq, src->seqlen + 1);

  dst->namelen = src->namelen;
  dst->name = name_copy;
  dst->seqlen = src->seqlen;
  dst->seq = seq_copy;
  return 0;
} /* end of func */


/*
 * seq_resize_seq -- change the length of s->seq to newsize characters.
 *
 * A new buffer of newsize + 1 bytes is allocated.  At most
 * min(old length, newsize) bytes are carried over from the old buffer,
 * so growing never reads past the end of the old data; any added tail
 * is zero-filled and the result is always NUL-terminated.
 *
 * Returns 0 on success, -1 when s is NULL, -3 when newsize is negative,
 * -2 when memory cannot be allocated (s is then left unchanged).
 */
int seq_resize_seq(SEQUENCE *s, int newsize)
{
  unsigned char *ptr;
  size_t keep = 0;

  if (s == NULL) return -1;
  if (newsize < 0) return -3;
  ptr = calloc((size_t)newsize + 1, sizeof(unsigned char));
  if (ptr == NULL) return -2;

  if (s->seq != NULL) {
    if (s->seqlen > 0) {
      keep = (size_t)s->seqlen;
      if (keep > (size_t)newsize) keep = (size_t)newsize;
    }
    memcpy(ptr, s->seq, keep);
    free(s->seq);
  }
  ptr[newsize] = '\0';

  s->seq = ptr;
  s->seqlen = newsize;
  return 0;
} /* end of func */

/*
 * seq_resize_name -- change the length of s->name to newsize characters.
 *
 * A new buffer of newsize + 1 bytes is allocated.  At most
 * min(old length, newsize) bytes are carried over from the old name,
 * so growing never reads past the end of the old data; any added tail
 * is zero-filled and the result is always NUL-terminated.
 *
 * Returns 0 on success, -1 when s is NULL, -3 when newsize is negative,
 * -2 when memory cannot be allocated (s is then left unchanged).
 */
int seq_resize_name(SEQUENCE *s, int newsize)
{
  unsigned char *ptr;
  size_t keep = 0;

  if (s == NULL) return -1;
  if (newsize < 0) return -3;
  ptr = calloc((size_t)newsize + 1, sizeof(unsigned char));
  if (ptr == NULL) return -2;

  /* test the name buffer itself (the sequence buffer is unrelated here) */
  if (s->name != NULL) {
    if (s->namelen > 0) {
      keep = (size_t)s->namelen;
      if (keep > (size_t)newsize) keep = (size_t)newsize;
    }
    memcpy(ptr, s->name, keep);
    free(s->name);
  }
  ptr[newsize] = '\0';

  s->name = ptr;
  s->namelen = newsize;
  return 0;
} /* end of func */

/*
 * seq_convertall_reverse -- reverse the sequence in place while mapping
 * every character through table (e.g. to build a reverse complement).
 *
 * table is indexed by character value.  An empty sequence is left alone.
 * Always returns 0.
 */
int seq_convertall_reverse(SEQUENCE *s, int *table)
{
  int front, back;
  int saved;
  int total;

  total = seq_getsize(s);
  if (total <= 0) return 0;

  /* walk inwards from both ends, swapping the converted characters */
  front = 0;
  back = total - 1;
  while (front < back) {
    saved = seq_getseq(s)[front];
    seq_getseq(s)[front] = table[seq_getseq(s)[back]];
    seq_getseq(s)[back] = table[saved];
    front++;
    back--;
  }

  /* odd length: the middle character stays in place but is converted */
  if (front == back) {
    seq_getseq(s)[front] = table[seq_getseq(s)[front]];
  }

  return 0;
} /* end of func */

/*
 * seq_count_content -- count how often selected characters occur in p.
 *
 * For the k-th character of cstr, table[k] receives the number of times
 * that character occurs in the sequence.  A character listed more than
 * once is reported only at its first position; later positions get 0.
 * NUL bytes are reported (in table[0]) only when cstr is the empty string.
 *
 * table must have room for one entry per character of cstr (one entry
 * when cstr is empty).
 *
 * Returns the number of sequence characters not reported in table.
 * The function keeps its histogram in static storage and is therefore
 * not reentrant.
 */
int seq_count_content(SEQUENCE *p, int *table, unsigned char *cstr)
{
  int i, c;
  int sum = 0;
  static int t[TABLESIZE];
  int len = seq_getsize(p);

  /* histogram of every character value in the sequence */
  for (i = 0; i < TABLESIZE; i++) t[i] = 0;

  for (i = 0; i < len; i++) {
    t[seq_getchar(p, i)] += 1;
  }

  /* move the requested counts into table, clearing them in the histogram */
  i = 0;
  c = *cstr++;
  do {
    table[i++] = t[c];
    t[c] = 0;
    /* with an empty cstr the terminator is already consumed: do not
       read beyond it */
    if (c != '\0') c = *cstr++;
  } while (c != '\0');

  /* whatever is left was not asked for */
  for (i = 0; i < TABLESIZE; i++) {
    sum += t[i];
  }

  return sum;
} /* end of func */

/* ************************************************************** */
/* function about tmpMem */
/*
 * tmpMem_open -- create a temporary buffer with its first block.
 *
 * fp is handed to tmpMem_reset_full(), which uses it to size the first
 * block.  Returns the new handle, or NULL when memory runs out.
 */
static struct tmpMem *tmpMem_open(FILE *fp)
{
  struct tmpMem *handle = malloc(sizeof(struct tmpMem));

  if (handle == NULL) return NULL;

  handle->fp = fp;
  handle->blocks = 0;
  handle->len = 0;
  handle->top_block = NULL;
  handle->current_block = NULL;

  if (tmpMem_reset_full(handle, fp) == NULL) {
    free(handle);
    return NULL;
  }

  return handle;
} /* end of func */

/*
 * tmpMem_reset_full -- make p ready for writing from the start.
 *
 * When p already owns blocks it is simply rewound with tmpMem_reset().
 * Otherwise a first block is allocated: its size is the size of the file
 * behind fp when fp is a regular file of non-zero size, and
 * TMPMEM_BLOCKSIZE in every other case.
 *
 * Returns p, or NULL when p is NULL or the block cannot be allocated.
 */
static struct tmpMem *tmpMem_reset_full(struct tmpMem *p, FILE *fp)
{
  struct stat info;
  off_t filesize = 0;
  tmpMem_len_t blocksize = TMPMEM_BLOCKSIZE;

  if (p == NULL) return NULL;

  if (p->top_block != NULL) {
    tmpMem_reset(p);
    return p;
  }

  /* only a regular file reports a meaningful size */
  if (fp != NULL
      && fstat(fileno(fp), &info) == 0
      && S_ISREG(info.st_mode) != 0) {
    filesize = info.st_size;
  }
  if (filesize > 0) blocksize = filesize;

  p->fp = fp;
  p->len = 0;
  p->blocks = 0;
  p->top_block = NULL;
  p->current_block = NULL;

  if (tmpMem_addNewBlock(p, blocksize) == NULL) return NULL;
  p->top_block = p->current_block;

  return p;
} /* end of func */

/*
 * tmpMem_close -- free every block of p and then p itself.
 * A NULL handle is ignored.
 */
static void tmpMem_close(struct tmpMem *p)
{
  struct tmpMem_block *cursor;

  if (p == NULL) return;

  /* tmpMem_freeBlock() hands back the successor of the block it freed */
  for (cursor = p->top_block; cursor != NULL; ) {
    cursor = tmpMem_freeBlock(cursor);
  }

  free(p);
} /* end of func */

/*
 * tmpMem_freeBlock -- free one block together with its storage.
 *
 * Returns the block that followed bp in the list (NULL at the tail or
 * when bp itself is NULL), which lets the caller keep walking the list.
 */
static struct tmpMem_block *tmpMem_freeBlock(struct tmpMem_block *bp)
{
  struct tmpMem_block *successor = NULL;

  if (bp != NULL) {
    successor = bp->next_block;
    free(bp->memory);
    free(bp);
  }

  return successor;
} /* end of func */

/*
 * tmpMem_useNextBlock -- move the write position to the next block.
 *
 * An already allocated successor block is reused; when there is none
 * (or no current block at all) a new block of TMPMEM_BLOCKSIZE bytes is
 * appended.  Returns the block now in use, or NULL when allocation fails.
 */
static void *tmpMem_useNextBlock(struct tmpMem *p)
{
  struct tmpMem_block *following = NULL;

  if (p->current_block != NULL) {
    following = p->current_block->next_block;
  }
  if (following == NULL) {
    return tmpMem_addNewBlock(p, TMPMEM_BLOCKSIZE);
  }

  p->current_block = following;
  p->current_rest = following->len;
  p->current_ptr = following->memory;

  return following;
} /* end of func */

/*
 * tmpMem_addNewBlock -- allocate a block of blocksize bytes, link it
 * behind the current block and make it the write target.
 *
 * The new block also becomes the top block when the list was empty.
 * Returns the new block, or NULL (with p unchanged) when memory runs out.
 */
static  void *tmpMem_addNewBlock(struct tmpMem *p, tmpMem_len_t blocksize)
{
  struct tmpMem_block *fresh;
  unsigned char *storage;

  fresh = malloc(sizeof(struct tmpMem_block));
  if (fresh == NULL) return NULL;

  storage = malloc(blocksize);
  if (storage == NULL) {
    free(fresh);
    return NULL;
  }

  fresh->memory = storage;
  fresh->len = blocksize;
  fresh->next_block = NULL;

  /* hook the block into the list */
  if (p->top_block == NULL) {
    p->top_block = fresh;
  }
  if (p->current_block != NULL) {
    p->current_block->next_block = fresh;
  }

  /* and start writing at its beginning */
  p->current_block = fresh;
  p->current_ptr = storage;
  p->current_rest = blocksize;
  p->blocks += 1;

  return (void *)fresh;
} /* end of func */

/*
 * tmpMem_export_to_malloc -- hand the bytes written so far to the caller
 * as one malloc()ed buffer of p->len bytes, then rewind p.
 *
 * Fast path: when p consists of a single block that is (nearly) full,
 * the block's storage itself is shrunk with realloc() and given away,
 * leaving p without blocks.  The block header is freed in that case, and
 * if the shrinking realloc() fails the still-valid, larger original
 * buffer is returned instead of losing the data.
 *
 * Otherwise the data is copied out of the block chain into a fresh
 * buffer and the blocks stay with p for reuse.
 *
 * Returns the buffer (owned by the caller, release with free()), or NULL
 * when the copy buffer cannot be allocated.
 */
static void *tmpMem_export_to_malloc(struct tmpMem *p)
{
  unsigned char *malloc_mem;
  unsigned char *shrunk;
  unsigned char *mem;
  struct tmpMem_block *bp;
  tmpMem_len_t restlen, copylen;

#ifdef DEBUG
  fprintf(stderr, "tmpMem p->blocks=%d\n", p->blocks);
#endif /* DEBUG */
  if (p->blocks == 1) {
    bp = p->current_block;
    if (p->len + TMPMEM_TH >= bp->len
        || (p->len >= TMPMEM_BLOCKSIZE &&
            p->len / 2 * 3 >= bp->len)
        ) {
      malloc_mem = bp->memory;
      if (p->len > 0) { /* never issue a zero-size realloc() */
        shrunk = realloc(bp->memory, p->len);
        if (shrunk != NULL) malloc_mem = shrunk;
        /* on failure the original storage is untouched and big enough */
      }
      free(bp); /* the header is no longer referenced by anyone */
      p->blocks = 0;
      p->top_block = NULL;
      p->current_block = NULL;
      tmpMem_reset(p);
      return malloc_mem;
    }
  } /* if (p->blocks == 1) */

  malloc_mem = malloc(p->len);
  if (malloc_mem != NULL) {
    mem = malloc_mem;
    restlen = p->len;
    bp = p->top_block;
    while (bp != NULL && restlen > 0) {
      copylen = (restlen < bp->len ? restlen : bp->len);
      memcpy(mem, bp->memory, copylen);
      mem += copylen;
      restlen -= copylen;
      bp = bp->next_block;
    }
  }

  tmpMem_reset(p);
  return malloc_mem;
} /* end of func */

static void tmpMem_reset(struct tmpMem *p)
{
  p->len = 0;
  p->current_block = p->top_block;
  if (p->current_block != NULL) {
    p->current_ptr = p->current_block->memory;
    p->current_rest = p->current_block->len;
  } else {
    p->current_ptr = NULL;
    p->current_rest = 0;
  }
 
  return;
} /* end of func */

/*
 * tmpMem_get_len -- number of bytes written to p since the last reset.
 */
static tmpMem_len_t tmpMem_get_len(struct tmpMem *p)
{
  tmpMem_len_t written = p->len;

  return written;
} /* end of func */

/*
 * tmpMem_putc -- append one byte to p.
 *
 * When the current block is full the next block is taken into use
 * (allocating it if necessary).  Returns 0 on success, EOF when no
 * further block can be obtained; nothing is stored in that case.
 */
static int tmpMem_putc(struct tmpMem *p, const int c)
{
  /* current_rest is unsigned, so "exhausted" means exactly zero */
  if (p->current_rest == 0 && tmpMem_useNextBlock(p) == NULL) {
    return EOF;
  }

  *p->current_ptr = (unsigned char)c;
  p->current_ptr += 1;
  p->current_rest -= 1;
  p->len += 1;

  return 0;
} /* end of func */

/* *************************************************************** */
/*
 * out_seq_putc -- write c to the stream chosen with set_output(),
 * breaking the line after every sg_width characters.
 *
 * A '\n' passed in directly after an automatically inserted line break
 * is swallowed (0 is returned) so that no empty line appears.
 * Otherwise returns the result of the last putc() performed.
 * The column counter lives in static storage and is shared by all
 * output streams.
 */
int out_seq_putc(int c)
{
  static int width_count = 0;
  static int auto_newline = 0; /* 1 right after an automatic line break */
  int swallow;
  int r;

  swallow = (auto_newline && c == '\n');
  auto_newline = 0;
  if (swallow) return 0;

  r = putc(c, sg_outfile);

  if (c == '\n') {
    width_count = 0;
    return r;
  }

  width_count += 1;
  if (width_count >= sg_width) {
    r = putc('\n', sg_outfile);
    width_count = 0;
    auto_newline = 1;
  }

  return r;
} /* end of func */

/*
 * print_seq -- write positions st..ed (1-based, inclusive) of p to fpo.
 *
 * rev != 0 writes the range from ed down to st.  Output goes through
 * out_seq_putc(), so lines are wrapped at the configured width and fpo
 * becomes the current output stream.  Unless FLAG_NORET is set in
 * flags_out a final newline is written; with FLAG_NORET the stream is
 * flushed instead.  The range is not checked against the sequence
 * length.  Always returns 0.
 */
int print_seq(SEQUENCE *p, FILE *fpo, long st, long ed,
 int rev, int flags_out)
{
  long pos;

  set_output(fpo);

  if (!rev) {
    pos = st - 1;
    while (pos < ed) {
      out_seq_putc(seq_getchar(p, pos));
      pos++;
    }
  } else {
    pos = ed - 1;
    while (pos >= st - 1) {
      out_seq_putc(seq_getchar(p, pos));
      pos--;
    }
  }

  if ((flags_out & FLAG_NORET) == 0) {
    out_seq_putc('\n');
  } else {
    fflush(fpo);
  }
  return 0;
} /* end of func */

/*
 * print_seq_c -- like print_seq(), but every character is mapped through
 * table (indexed by character value) before it is written.
 *
 * Writes positions st..ed (1-based, inclusive) of p to fpo, from ed down
 * to st when rev != 0.  Unless FLAG_NORET is set in flags_out a final
 * newline is written; with FLAG_NORET the stream is flushed instead.
 * The range is not checked against the sequence length.
 * Always returns 0.
 */
int print_seq_c(SEQUENCE *p, FILE *fpo, long st, long ed,
 int rev, int flags_out, int *table)
{
  long pos;

  set_output(fpo);

  if (!rev) {
    pos = st - 1;
    while (pos < ed) {
      out_seq_putc(table[seq_getchar(p, pos)]);
      pos++;
    }
  } else {
    pos = ed - 1;
    while (pos >= st - 1) {
      out_seq_putc(table[seq_getchar(p, pos)]);
      pos--;
    }
  }

  if ((flags_out & FLAG_NORET) == 0) {
    out_seq_putc('\n');
  } else {
    fflush(fpo);
  }
  return 0;
} /* end of func */

/*
 * get_convert_table -- build the character conversion table for type.
 *
 * The table starts out as the identity mapping.  The bits selected by
 * CMASK_111 choose one substitution list (complement DNA/RNA, to DNA,
 * to RNA); the bits selected by CMASK_11000 optionally fold the results
 * to upper or lower case.
 *
 * The table lives in static storage: the same pointer is returned on
 * every call and its contents are rebuilt whenever type differs from the
 * previous call (or is 0), so a previously returned table changes too.
 */
int *get_convert_table(int type)
{
  static int table[TABLESIZE];
  static int tabletype = 0;
  int (*pair)[2];
  int casemode;
  int idx;

  /* same non-zero request as last time: the table is still valid */
  if (tabletype != 0 && tabletype == type) return table;
  tabletype = type;

  for (idx = 0; idx < TABLESIZE; idx++) table[idx] = idx;

  /* pick the substitution list, if any */
  switch (type & CMASK_111) {
  case CONVERT_COMPLEMENT_DNA:
    pair = sg_complement_dna;
    break;
  case CONVERT_COMPLEMENT_RNA:
    pair = sg_complement_rna;
    break;
  case CONVERT_DNA:
    pair = sg_todna;
    break;
  case CONVERT_RNA:
    pair = sg_torna;
    break;
  default:
    pair = NULL;
    break;
  }

  /* apply {from, to} pairs up to the -1 sentinel */
  if (pair != NULL) {
    while ((*pair)[0] != -1) {
      table[(*pair)[0]] = (*pair)[1];
      pair++;
    }
  }

  casemode = type & CMASK_11000;
  if (casemode == CONVERT_TOUPPER) {
    for (idx = 0; idx < TABLESIZE; idx++) table[idx] = toupper(table[idx]);
  } else if (casemode == CONVERT_TOLOWER) {
    for (idx = 0; idx < TABLESIZE; idx++) table[idx] = tolower(table[idx]);
  }

  return table;
} /* end of func */

/*
 * seq_convertall -- replace every character of the sequence in place by
 * its entry in table (indexed by character value).  Always returns 0.
 */
int seq_convertall(SEQUENCE *p, int *table)
{
  int pos = 0;

  while (pos < p->seqlen) {
    p->seq[pos] = table[p->seq[pos]];
    pos++;
  }

  return 0;
} /* end of func */

/* *************************************************************** */
/* functions for  reading FASTA sequence(s) */
/*
 * read_fasta_custom -- read one FASTA record from fp into *sp, using tp
 * as scratch space.
 *
 * Input is skipped up to the next '>' that starts a record.  The header
 * line (including the '>', without the line end) becomes sp->name; every
 * following character accepted by is_proper_character() becomes sp->seq,
 * up to the next record start or end of input.  Both buffers are
 * NUL-terminated and owned by sp afterwards; the lengths exclude the
 * terminator.  A record without sequence characters is returned with
 * seqlen 0.
 *
 * sp->name and sp->seq are overwritten without being freed, so sp should
 * be empty on entry.
 *
 * Returns 0 on success, EOF when the input ends before a complete header
 * line has been read, and -2 when memory runs out.  After -2, sp->name
 * may already have been set while sp->seq has not; seq_close() releases
 * it.
 */
static int read_fasta_custom(SEQUENCE *sp, FILE *fp, struct tmpMem *tp)
{
  int c;
  tmpMem_len_t stored;
  void *exported;
#define seq_putc(c) (tmpMem_putc(tp, (c)))
#define title_putc(c) (tmpMem_putc(tp, (c)))

  /* skip everything before the first record start */
  is_start_fasta(EOF); /* initialize */
  do {
    c = getc(fp);
    if (c == EOF) return EOF;
  } while (is_start_fasta(c) == 0);
  /* value of (c) is used by next step */

  if (tmpMem_reset_full(tp, fp) == NULL) return -2;

  /* header line */
  do {
    if (title_putc(c) == EOF) return -2;
    c = getc(fp);
    if (c == EOF) return EOF;
  } while (is_end_line(c) == 0);

  stored = tmpMem_get_len(tp);
  if (title_putc('\0') == EOF) return -2;
  exported = tmpMem_export_to_malloc(tp);
  if (exported == NULL) return -2;
  sp->namelen = stored;
  sp->name = exported;

  /* sequence lines, up to the next record start */
  is_start_fasta(EOF); /* initialize */
  while ((c = getc(fp)) != EOF) {
    if (is_start_fasta(c)) {
      ungetc(c, fp); /* leave the '>' for the next call */
      break; /* while */
    }
    if (is_proper_character(c)) {
      if (seq_putc(c) == EOF) return -2;
    }
  } /* while */

  stored = tmpMem_get_len(tp);
  if (seq_putc('\0') == EOF) return -2;
  exported = tmpMem_export_to_malloc(tp);
  if (exported == NULL) return -2;
  sp->seqlen = stored;
  sp->seq = exported;

  return 0;
} /* end of func */

/*
 * is_start_fasta -- stateful detector for the '>' that opens a record.
 *
 * Feed the input one character at a time.  Returns 1 for a '>' that is
 * the first non-control character of a line, 0 otherwise.  Calling it
 * with EOF resets the detector to "at start of line" and returns 0.
 */
static int is_start_fasta(int c)
{
  static int at_line_start = 1;

  if (c == EOF) { /* reset request */
    at_line_start = 1;
    return 0;
  }

  if (at_line_start && c == '>') {
    at_line_start = 1; /* ready for the next search */
    return 1;
  }

  if (c == '\n' || c == '\r') {
    at_line_start = 1;
  } else if (at_line_start && !iscntrl(c)) {
    /* a visible character or blank ends the "start of line" state;
       other control characters do not */
    at_line_start = 0;
  }

  return 0;
} /* end of func */

/*
 * is_end_line -- 1 when c is a line feed or carriage return, else 0.
 */
static int is_end_line(int c)
{
  return (c == '\r' || c == '\n') ? 1 : 0;
} /* end of func */

/*
 * is_proper_character -- 1 when c may be stored as part of a sequence
 * (a letter, digit or punctuation character), 0 for anything else such
 * as blanks and line ends.
 */
static int is_proper_character(int c)
{
  if (!isalnum(c) && !ispunct(c)) {
    return 0;
  }

  return 1;
}

