Codebase list slib / cme/main ncbi-dna.scm
cme/main

Tree @cme/main (Download .tar.gz)

ncbi-dna.scm @cme/mainraw · history · blame

;;;; "ncbi-dna.scm" Read and manipulate NCBI-format nucleotide sequences
;;; Copyright (C) 2003 Aubrey Jaffer
;
;Permission to copy this software, to modify it, to redistribute it,
;to distribute modified versions, and to use it for any purpose is
;granted, subject to the following restrictions and understandings.
;
;1.  Any copy made of this software must include this copyright notice
;in full.
;
;2.  I have made no warranty or representation that the operation of
;this software will be error-free, and I am under no obligation to
;provide any services, by way of maintenance, update, or otherwise.
;
;3.  In conjunction with products arising from the use of this
;material, there shall be no use of my name in any advertising,
;promotional, or sales literature without prior written consent in
;each case.

(require 'array)
(require 'scanf)
(require 'string-case)
(require 'string-search)
(require 'array-for-each)
(require 'multiarg-apply)
(require-if 'compiling 'printf)	;used by cDNA:report-base-count

;;@code{(require 'ncbi-dma)}
;;@ftindex ncbi-dma

(define (ncbi:read-DNA-line port)
  (define lst (scanf-read-list
	       " %d %[acgt] %[acgt] %[acgt] %[acgt] %[acgt] %[acgt]" port))
  (cond ((or (null? lst) (eof-object? lst)) #f)
	((not (eqv? 1 (modulo (car lst) 60)))
	 (slib:warn 'bad 'idx lst) #f)
	(else (apply string-append (cdr lst)))))

;;@body
;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN}
;;from @1.
(define (ncbi:read-DNA-sequence port)
  (find-string-from-port? "ORIGIN" port)
  (find-string-from-port? (string #\newline) port)
  (do ((lne (ncbi:read-DNA-line port) (ncbi:read-DNA-line port))
       (lns '() (cons lne lns)))
      ((not lne) (apply string-append (reverse lns)))))

;;@body
;;Reads the NCBI-format DNA sequence following the word @samp{ORIGIN}
;;from @1.
(define (ncbi:read-file file)
  (call-with-input-file file ncbi:read-DNA-sequence))

;;@body
;;Replaces @samp{T} with @samp{U} in @1
(define (mRNA<-cDNA str)
  (array-for-each
   (lambda (chr)
     (case chr
       ((#\a) #\a)
       ((#\t) #\u)
       ((#\c) #\c)
       ((#\g) #\g)
       ((#\A) #\A)
       ((#\T) #\U)
       ((#\C) #\C)
       ((#\G) #\G)
       (else chr)))
   str))

(define cDNA:codons
  '((TTT phe #\F) (TCT ser #\S) (TAT tyr #\Y) (TGT cys #\C)
    (TTC phe #\F) (TCC ser #\S) (TAC tyr #\Y) (TGC cys #\C)
    (TTA leu #\L) (TCA ser #\S) (TAA)         (TGA) ;stops
    (TTG leu #\L) (TCG ser #\S) (TAG)         (TGG trp #\W)
    (CTT leu #\L) (CCT pro #\P) (CAT his #\H) (CGT arg #\R)
    (CTC leu #\L) (CCC pro #\P) (CAC his #\H) (CGC arg #\R)
    (CTA leu #\L) (CCA pro #\P) (CAA gln #\Q) (CGA arg #\R)
    (CTG leu #\L) (CCG pro #\P) (CAG gln #\Q) (CGG arg #\R)
    (ATT ile #\I) (ACT thr #\T) (AAT asn #\N) (AGT ser #\S)
    (ATC ile #\I) (ACC thr #\T) (AAC asn #\N) (AGC ser #\S)
    (ATA ile #\I) (ACA thr #\T) (AAA lys #\K) (AGA arg #\R)
    (ATG met #\M) (ACG thr #\T) (AAG lys #\K) (AGG arg #\R)
    (GTT val #\V) (GCT ala #\A) (GAT asp #\D) (GGT gly #\G)
    (GTC val #\V) (GCC ala #\A) (GAC asp #\D) (GGC gly #\G)
    (GTA val #\V) (GCA ala #\A) (GAA glu #\E) (GGA gly #\G)
    (GTG val #\V) (GCG ala #\A) (GAG glu #\E) (GGG gly #\G)))

;;@body
;;Returns a list of three-letter symbol codons comprising the protein
;;sequence encoded by @1 starting with its first occurence of
;;@samp{atg}.
(define (codons<-cDNA cDNA)
  (define len (string-length cDNA))
  (define start #f)
  (set! start (substring-ci? "atg" cDNA))
  (if (not start) (slib:warn 'missed 'start))
  (let loop ((protein '(*N*))
	     (cdx (or start 0)))
    (if (<= len cdx) (slib:error 'reached 'end cdx))
    (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx)))))
      (define asc (assq codon cDNA:codons))
      (cond ((not asc)
	     (slib:warn 'mystery 'codon codon)
	     (reverse (cons '*C* protein)))
	    ((null? (cdr asc)) (reverse (cons '*C* protein)))
	    (else (loop (cons codon protein) (+ 3 cdx)))))))

;;@body
;;Returns a list of three-letter symbols for the protein sequence
;;encoded by @1 starting with its first occurence of @samp{atg}.
(define (protein<-cDNA cDNA)
  (define len (string-length cDNA))
  (define start #f)
  (set! start (substring-ci? "atg" cDNA))
  (if (not start) (slib:warn 'missed 'start))
  (let loop ((protein '(*N*))
	     (cdx (or start 0)))
    (if (<= len cdx) (slib:error 'reached 'end cdx))
    (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx)))))
      (define asc (assq codon cDNA:codons))
      (cond ((not asc)
	     (slib:warn 'mystery 'codon codon)
	     (reverse (cons '*C* protein)))
	    ((null? (cdr asc)) (reverse (cons '*C* protein)))
	    (else (loop (cons (cadr asc) protein) (+ 3 cdx)))))))

;;@body
;;Returns a string of one-letter amino acid codes for the protein
;;sequence encoded by @1 starting with its first occurence of
;;@samp{atg}.
(define (P<-cDNA cDNA)
  (define len (string-length cDNA))
  (define start #f)
  (set! start (substring-ci? "atg" cDNA))
  (if (not start) (slib:warn 'missed 'start))
  (let loop ((protein '())
	     (cdx (or start 0)))
    (if (<= len cdx) (slib:error 'reached 'end cdx))
    (let ((codon (string-ci->symbol (substring cDNA cdx (+ 3 cdx)))))
      (define asc (assq codon cDNA:codons))
      (cond ((not asc) (slib:error 'mystery 'codon codon))
	    ((null? (cdr asc)) (list->string (reverse protein)))
	    (else (loop (cons (caddr asc) protein) (+ 3 cdx)))))))

;;@
;;These cDNA count routines provide a means to check the nucleotide
;;sequence with the @samp{BASE COUNT} line preceding the sequence from
;;NCBI.

;;@body
;;Returns a list of counts of @samp{a}, @samp{c}, @samp{g}, and
;;@samp{t} occurrencing in @1.
(define (cDNA:base-count cDNA)
  (define cnt:a 0)
  (define cnt:c 0)
  (define cnt:g 0)
  (define cnt:t 0)
  (array-for-each (lambda (chr)
		    (case chr
		      ((#\a #\A) (set! cnt:a (+ 1 cnt:a)))
		      ((#\c #\C) (set! cnt:c (+ 1 cnt:c)))
		      ((#\g #\G) (set! cnt:g (+ 1 cnt:g)))
		      ((#\t #\T) (set! cnt:t (+ 1 cnt:t)))
		      (else (slib:error 'cDNA:base-count 'unknown 'base chr))))
		  cDNA)
  (list cnt:a cnt:c cnt:g cnt:t))

;;@body
;;Prints the counts of @samp{a}, @samp{c}, @samp{g}, and @samp{t}
;;occurrencing in @1.
(define (cDNA:report-base-count cDNA)
  (require 'printf)
  (apply printf "BASE COUNT   %6d a %6d c %6d g %6d t\\n"
	 (cDNA:base-count cDNA)))