001/* 002 * BioJava development code 003 * 004 * This code may be freely distributed and modified under the 005 * terms of the GNU Lesser General Public Licence. This should 006 * be distributed with the code. If you do not have a copy, 007 * see: 008 * 009 * http://www.gnu.org/copyleft/lesser.html 010 * 011 * Copyright for this code is held jointly by the individual 012 * authors. These should be listed in @author doc comments. 013 * 014 * For more information on the BioJava project and its aims, 015 * or to join the biojava-l mailing list, visit the home page 016 * at: 017 * 018 * http://www.biojava.org/ 019 * 020 */ 021package org.biojava.nbio.core.sequence.storage; 022 023import org.biojava.nbio.core.sequence.AccessionID; 024import org.biojava.nbio.core.sequence.compound.NucleotideCompound; 025import org.biojava.nbio.core.sequence.template.CompoundSet; 026import org.biojava.nbio.core.sequence.template.Sequence; 027 028import java.util.ArrayList; 029import java.util.HashMap; 030import java.util.List; 031import java.util.Map; 032 033/** 034 * Implementation of the 2bit encoding. This will default to the following 035 * encodings: 036 * 037 * <ul> 038 * <li>0 - T</li> 039 * <li>1 - C</li> 040 * <li>2 - A</li> 041 * <li>3 - G</li> 042 * </ul> 043 * 044 * We also do not support case sensitive encodings therefore if you pass a 045 * lowercased a this will be treated as if it is an uppercase A and we will 046 * erase that information. 047 * 048 * @author ayates 049 */ 050public class TwoBitSequenceReader<C extends NucleotideCompound> extends BitSequenceReader<C> { 051 052 public TwoBitSequenceReader(Sequence<C> sequence) { 053 super(new TwoBitArrayWorker<C>(sequence), sequence.getAccession()); 054 } 055 056 public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet) { 057 this(sequence, compoundSet, new AccessionID("Unknown")); 058 } 059 060 public TwoBitSequenceReader(String sequence, CompoundSet<C> compoundSet, AccessionID accession) { 061 super(new TwoBitArrayWorker<C>(sequence, compoundSet), accession); 062 } 063 064 public TwoBitSequenceReader(TwoBitArrayWorker<C> worker) { 065 super(worker, new AccessionID("unknown")); 066 } 067 068 public TwoBitSequenceReader(TwoBitArrayWorker<C> worker, AccessionID accession) { 069 super(worker, accession); 070 } 071 072 /** 073 * Extension of the BitArrayWorker which provides the 2bit implementation 074 * code. This is intended to work with the 4 basic nucelotide types. If you 075 * require a different version of the encoding used here then extend 076 * and override as required. 077 * 078 * @param <C> Must extend NucleotideCompound 079 */ 080 public static class TwoBitArrayWorker<C extends NucleotideCompound> extends BitArrayWorker<C> { 081 082 public TwoBitArrayWorker(CompoundSet<C> compoundSet, int length) { 083 super(compoundSet, length); 084 } 085 086 public TwoBitArrayWorker(CompoundSet<C> compoundSet, int[] sequence) { 087 super(compoundSet, sequence); 088 } 089 090 public TwoBitArrayWorker(Sequence<C> sequence) { 091 super(sequence); 092 } 093 094 public TwoBitArrayWorker(String sequence, CompoundSet<C> compoundSet) { 095 super(sequence, compoundSet); 096 } 097 098 /** 099 * Masking value used for extracting the right most 2 bits from a byte 100 */ 101 private final static byte MASK = (byte) ((int) Math.pow(2, 0) | (int) Math.pow(2, 1)); 102 103 @Override 104 protected byte bitMask() { 105 return MASK; 106 } 107 108 @Override 109 protected int compoundsPerDatatype() { 110 return 16; 111 } 112 113 /** 114 * Returns a Map which encodes TCAG into positions 0,1,2,3. 115 */ 116 @Override 117 @SuppressWarnings("serial") 118 protected Map<C, Integer> generateCompoundsToIndex() { 119 final CompoundSet<C> cs = getCompoundSet(); 120 return new HashMap<C, Integer>() { 121 122 { 123 put(cs.getCompoundForString("T"), 0); 124 put(cs.getCompoundForString("C"), 1); 125 put(cs.getCompoundForString("A"), 2); 126 put(cs.getCompoundForString("G"), 3); 127 put(cs.getCompoundForString("t"), 0); 128 put(cs.getCompoundForString("c"), 1); 129 put(cs.getCompoundForString("a"), 2); 130 put(cs.getCompoundForString("g"), 3); 131 } 132 }; 133 } 134 135 /** 136 * Returns a List which encodes TCAG into positions 0,1,2,3. 137 */ 138 @Override 139 protected List<C> generateIndexToCompounds() { 140 CompoundSet<C> cs = getCompoundSet(); 141 List<C> result = new ArrayList<C>(); 142 result.add( cs.getCompoundForString("T")); 143 144 145 result.add( cs.getCompoundForString("C")); 146 result.add( cs.getCompoundForString("A")); 147 result.add( cs.getCompoundForString("G")); 148 return result; 149 } 150 } 151 152}