Parses a FASTA defline and extracts IDs and other information. IDs are NSIDs (NCBI standard FASTA sequence identifiers) or “:”-separated IDs.
The specifications are described in: ftp.ncbi.nih.gov/blast/documents/README.formatdb and blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]') rub.entry_id ==> 'gi|671595' rub.get('emb') ==> 'CAA85678.1' rub.emb ==> 'CAA85678.1' rub.gi ==> '671595' rub.accession ==> 'CAA85678' rub.accessions ==> [ 'CAA85678' ] rub.acc_version ==> 'CAA85678.1' rub.locus ==> nil rub.list_ids ==> [["gi", "671595"], ["emb", "CAA85678.1", nil], ["Perovskia abrotanoides"]] ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]") ckr.entry_id ==> "gi|2495000" ckr.sp ==> "CCKR_CAVPO" ckr.pir ==> "I51898" ckr.gb ==> "AAB29504.1" ckr.gi ==> "2495000" ckr.accession ==> "AAB29504" ckr.accessions ==> ["Q63931", "AAB29504"] ckr.acc_version ==> "AAB29504.1" ckr.locus ==> nil ckr.description ==> "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)" ckr.descriptions ==> ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)", "cholecystokinin A receptor - guinea pig", "cholecystokinin A receptor; CCK-A receptor [Cavia]"] ckr.words ==> ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig", "receptor", "type"] ckr.id_strings ==> ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898", "544724", "AAB29504.1", "Cavia"] ckr.list_ids ==> [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"], ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"], ["gb", "AAB29504.1", nil], ["Cavia"]]
Fasta format description (NCBI) www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.) (Dead link. Please find in web.archive.org/ ). blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
Program Parameters for formatdb and fastacmd (by Tao Tao) www.ncbi.nlm.nih.gov/staff/tao/URLAPI/formatdb_fastacmd.html#t1.1
Formatdb README ftp.ncbi.nih.gov/blast/documents/formatdb.html
Shows the accession with version number. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 494
# Returns the first ID typed 'acc_version' (accession with version), or nil.
# The lookup is done once through get_by_type; the result is cached in
# @acc_version even when it is nil, so later calls never repeat the search.
def acc_version
  return @acc_version if defined?(@acc_version)

  @acc_version = get_by_type('acc_version')
end
Shows an accession number.
# File lib/bio/db/fasta/defline.rb, line 512
# Returns a single accession number without its version suffix.
# When acc_version yields a value, the part before the first '.' is used;
# otherwise the first element of accessions is used. The result is cached
# in @accession.
def accession
  return @accession if defined?(@accession)

  versioned = acc_version
  @accession = versioned ? versioned.split('.')[0] : accessions[0]
end
Shows accession numbers. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 503
# Returns every ID typed 'accession' or 'acc_version' as an array of
# strings, with everything from the first '.' onward removed from each.
# The array is built once and cached in @accessions.
def accessions
  return @accessions if defined?(@accessions)

  @accessions = get_all_by_type('accession', 'acc_version')
  @accessions.map! { |id| id.sub(/\..*\z/, '') }
end
Parses given string and adds parsed data.
# File lib/bio/db/fasta/defline.rb, line 195 def add_defline(str) case str when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/ # NSIDs # examples: # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P # # note: regexp (:?) means grouping without backreferences i = $1 d = $2 tks = i.split('|') tks << '' if i[-1,1] == '|' a = parse_NSIDs(tks) i = a[0].join('|') a.unshift('|') d = tks.join('|') + ' ' + d unless tks.empty? a << d this_line = a match_EC(d) parse_square_brackets(d).each do |x| if !match_EC(x, false) and x =~ /\A[A-Z]/ then di = [ x ] @list_ids << di @info['organism'] = x unless @info['organism'] end end when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/ # examples: # >sce:YBR160W CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST] # >emb:CACDC28 [X80034] C.albicans CDC28 gene i = $1 d = $2 a = parse_ColonSepID(i) i = a.join(':') this_line = [ ':', a , d ] match_EC(d) parse_square_brackets(d).each do |x| if !match_EC(x, false) and x =~ /:/ then parse_ColonSepID(x) elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/ then @list_ids << [ $1 ] end end when /^\>?\s*(\S+)(?:\s+(.+))?$/ # examples: # >ABC12345 this is test i = $1 d = $2.to_s @list_ids << [ i.chomp('.') ] this_line = [ '', [ i ], d ] match_EC(d) else i = str d = '' match_EC(i) this_line = [ '', [ i ], d ] end @deflines << this_line @entry_id = i unless @entry_id end
Shows description.
# File lib/bio/db/fasta/defline.rb, line 337
# Returns the last element of the first record in @deflines (its
# description), or nil when @deflines is empty.
def description
  first_record = @deflines[0].to_a
  first_record[-1]
end
Returns descriptions.
# File lib/bio/db/fasta/defline.rb, line 342
# Returns an array holding the last element (the description) of every
# record in @deflines, in order.
def descriptions
  @deflines.map { |record| record[-1] }
end
Returns an identifier by a database name.
# File lib/bio/db/fasta/defline.rb, line 418
# Returns the identifier stored for the database tag +dbname+, or nil.
#
# dbname:: database tag; converted with to_s, so a Symbol works too.
#
# A value already present in @info wins. Otherwise the first @list_ids
# entry whose tag equals the name is used:
# * an entry with at most two elements yields its last element;
# * a longer entry is searched by field label (from the class's NSIDs
#   table) in the order acc_version, entry_id, locus, accession, number,
#   and falls back to the first non-nil field.
# A value that is found is cached in @info under the tag.
def get(dbname)
  db = dbname.to_s
  cached = @info[db]
  return cached if cached

  entry = @list_ids.find { |ids| ids[0] == db }
  found = nil
  if entry && entry.size <= 2
    found = entry[-1]
  elsif entry
    labels = self.class::NSIDs[db]
    %w[acc_version entry_id locus accession number].each do |label|
      pos = labels.index(label)
      next unless pos

      # Field values sit one slot after their label because slot 0 is the tag.
      found = entry[pos + 1]
      break if found
    end
    found = entry[1..-1].find { |field| field } unless found
  end
  @info[db] = found if found
  found
end
Returns identifiers by given type.
# File lib/bio/db/fasta/defline.rb, line 454
# Collects, from every @list_ids entry whose tag is listed in the class's
# NSIDs table, the fields whose label is one of +type_strarg+.
# nil fields are skipped. Returns an array in @list_ids order; within one
# entry, values follow the order of the requested types.
def get_all_by_type(*type_strarg)
  found = []
  @list_ids.each do |entry|
    labels = self.class::NSIDs[entry[0]]
    next unless labels

    type_strarg.each do |type|
      pos = labels.index(type)
      next unless pos

      value = entry[pos + 1]
      found << value if value
    end
  end
  found
end
Returns an identifier by given type.
# File lib/bio/db/fasta/defline.rb, line 442
# Returns the field labelled +type_str+ from the first @list_ids entry
# whose tag (per the class's NSIDs table) has such a label.
# The field of that first entry is returned as is, so the result can be nil
# even if a later entry carries a value. Returns nil when no entry qualifies.
def get_by_type(type_str)
  @list_ids.each do |entry|
    labels = self.class::NSIDs[entry[0]]
    next unless labels

    pos = labels.index(type_str)
    return entry[pos + 1] if pos
  end
  nil
end
Shows the GI. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 483
# Returns the first ID typed 'gi', or nil.
# The lookup goes through get_by_type once; the result, nil included, is
# cached in @gi.
def gi
  return @gi if defined?(@gi)

  @gi = get_by_type('gi')
end
Shows ID-like strings. Returns an array of strings.
# File lib/bio/db/fasta/defline.rb, line 350
# Returns an array of ID-like strings, in this order:
# 1. from @list_ids: for entries with two or more elements, every non-nil
#    element after the tag; for shorter entries, the first element itself
#    when it is non-empty and made only of letters, digits, '.', '-', '_';
# 2. from words(true, []): case-sensitive description words that look like
#    identifiers (uppercase-led with an embedded digit run, or
#    uppercase/digit tokens containing '_').
def id_strings
  from_ids = @list_ids.flat_map do |entry|
    if entry.size >= 2
      entry[1..-1].select { |field| field }
    elsif entry[0].to_s.size > 0 && entry[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
      [entry[0]]
    else
      []
    end
  end

  id_like_words = words(true, []).select do |word|
    word =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ ||
      word =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
  end

  from_ids + id_like_words
end
Shows the locus. If the entry has more than one such ID, only the first one is shown. Returns a string or nil.
# File lib/bio/db/fasta/defline.rb, line 472
# Returns the first ID typed 'locus', or nil.
# The lookup goes through get_by_type once; the result, nil included, is
# cached in @locus.
def locus
  return @locus if defined?(@locus)

  @locus = get_by_type('locus')
end
# File lib/bio/db/fasta/defline.rb, line 523
# Lets database tags be used as method names, e.g. obj.emb or obj.gi.
#
# The call is forwarded to get(name, *args); get takes exactly one
# argument, so passing extra arguments raises ArgumentError there.
# Returns the value from get. A nil result is still returned (rather than
# raising) when +name+ is a tag listed in the class's NSIDs table.
#
# Raises RuntimeError, with a "NameError: ..." message, when get finds
# nothing and the name is not a known tag. The exception class and message
# are kept as they were so existing rescue clauses keep working.
def method_missing(name, *args)
  value = get(name, *args)
  if !value && !self.class::NSIDs[name.to_s]
    raise "NameError: undefined method `#{name.inspect}'"
  end
  value
end

# Keeps respond_to? consistent with method_missing: answers true for
# exactly the names method_missing accepts without raising (a known NSIDs
# tag, or a name for which get returns a value) and defers to the default
# behaviour otherwise.
def respond_to_missing?(name, include_private = false)
  key = name.to_s
  return true if self.class::NSIDs[key] || get(key)

  super
end
Shows the original string. Note that the result of this method may differ from the string originally given to FastaDefline.new.
# File lib/bio/db/fasta/defline.rb, line 329 def to_s @deflines.collect { |a| s = a[0] (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip }.join("\x01") end
Shows words used in the defline. Returns an Array.
# File lib/bio/db/fasta/defline.rb, line 392
# Returns the sorted, de-duplicated words found in all descriptions.
#
# case_sensitive:: when false/nil (default) the words are lowercased
# kill_regexp::    array of patterns; a word matching any of them is dropped
# kwhash::         hash looked up with the lowercased word; a truthy value
#                  drops the word
#
# The joined descriptions are split on punctuation, whitespace and control
# characters. Leading runs of $ * - + and trailing runs of $ * - = are
# removed from each token, and tokens of length 0 or 1 are dropped.
def words(case_sensitive = nil,
          kill_regexp = self.class::KillRegexpArray,
          kwhash = self.class::KillWordsHash)
  tokens = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)

  kept = tokens.each_with_object([]) do |token, acc|
    word = token.sub(/\A[\$\*\-\+]+/, '').sub(/[\$\*\-\=]+\z/, '')
    next if word.size <= 1
    next if kwhash[word.downcase]
    next if kill_regexp.find { |expr| expr =~ word }

    acc << word
  end

  kept = kept.map { |word| word.downcase } unless case_sensitive
  kept.sort.uniq
end
Generated with the Darkfish Rdoc Generator 2.