How can I make a motif into a regular expression?
One of the interesting things you can do with BioJava's MotifTools is to make a Sequence into a Java regular expression Pattern. You can then use this Pattern to search Strings for the existence of that Pattern. The generated Pattern can even be from an ambiguous sequence such as "acgytnwacrs".
The following example from Andy Hammer demonstrates how this can be used to search Sequences for Motifs.
/**
* MotifLister.java
* Modified slightly from the original by Andy Hammer
*
* Lists all instances of a motif in a specified (dna/rna/protein) fasta file.
* The motif can contain Ambiguity symbols
* Lists the ORF title and position of motif
* Outputs a list of counts to stdout.
*/
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.biojava.bio.BioError;
import org.biojava.bio.BioException;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.ProteinTools;
import org.biojava.bio.seq.RNATools;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.symbol.MotifTools;
import org.biojava.bio.symbol.SymbolList;
/**
 * Searches every sequence of a FASTA file for a (possibly ambiguous) motif.
 *
 * The motif is turned into a regular expression with
 * MotifTools.createRegex and matched against the string form of each
 * sequence. Each accepted hit is printed to stdout as
 * "name : [start,end]", followed by the total number of hits.
 *
 * All of the work is performed by the constructor; main is a thin
 * command-line wrapper around it.
 */
public class MotifLister{
    // The motif parsed from the command line, in the requested alphabet.
    private SymbolList motif;
    // Requested reading frame: 1, 2 or 3, or 0 to accept hits in any frame.
    private int frame;
    // Number of hits printed so far.
    private int count;
    // Iterator over the sequences read from the input file.
    private SequenceIterator si;

    /**
     * Runs the search and prints the results.
     *
     * @param type      "dna" or "rna" (case-insensitive); any other value is
     *                  treated as protein when parsing the motif
     * @param inputFile path of the FASTA file to search
     * @param target    the motif, written in the alphabet named by type
     * @param placement the frame as a decimal string, 0 through 3
     * @throws Exception if the file cannot be opened or read, or if
     *                   placement is not an integer
     */
    public MotifLister(String type, String inputFile,
                       String target, String placement)throws Exception{
        System.out.println("MotifLister is searching file " + inputFile +
                           " for the motif '" + target +
                           "' in frame " + placement + ".");

        try{
            if(type.equalsIgnoreCase("dna")){
                motif = DNATools.createDNA(target);
            }else if(type.equalsIgnoreCase("rna")){
                motif = RNATools.createRNA(target);
            }else{
                motif = ProteinTools.createProtein(target);
            }
        }
        catch(BioException e){
            // Illegal symbols are reported through a checked BioException
            // subclass, so it must be caught here as well as BioError.
            exitOnMotifMismatch(target, type);
        }
        catch(BioError e){
            exitOnMotifMismatch(target, type);
        }

        frame = Integer.parseInt(placement);
        if (frame < 0 || frame > 3) {
            System.err.println("Only frames 0 through 3 are allowed");
            System.err.println("frame zero searches all frames.");
            System.exit(1);
        }

        //make a regex expression for the SymbolList using MotifTools
        Pattern p = Pattern.compile( MotifTools.createRegex(motif) );
        count = 0;

        //read the input
        BufferedReader input = new BufferedReader(
            new InputStreamReader(new FileInputStream(inputFile)));
        try{
            // Use the alphabet requested by the caller (the value of type).
            si = (SequenceIterator)SeqIOTools.fileToBiojava("fasta", type, input);

            //for each sequence
            while (si.hasNext()){
                Sequence seq = si.nextSequence();
                String residues = seq.seqString();

                //get the regex matcher for the pattern
                Matcher matcher = p.matcher(residues);
                int start = 0;

                // Restart the search one position after the previous hit so
                // that overlapping occurrences are reported too. The bounds
                // check keeps find(int) from throwing if a zero-length match
                // at the end of the text pushes start past the end.
                while(start <= residues.length() && matcher.find(start)) {
                    start = matcher.start();
                    int end = matcher.end();

                    // Frame of the hit, 1 to 3, from its 0-based start offset.
                    int result = (start % 3) + 1;
                    if(result == frame || frame == 0){
                        // Print 1-based, inclusive coordinates.
                        System.out.println(seq.getName() + " : " +
                                           "[" + (start + 1) + "," + (end) + "]");
                        count++;
                    }
                    start++;
                }
            }
            System.out.println("Total Hits = " + count);
        }
        catch(BioException e){
            System.err.println(inputFile + " is not a " + type + " file.");
            System.err.println(e);
        }
        finally{
            // Close the file even when parsing or matching fails.
            input.close();
        }
    }

    /**
     * Reports that the motif could not be parsed in the requested alphabet
     * and terminates the JVM with a non-zero status.
     */
    private static void exitOnMotifMismatch(String target, String type){
        System.err.println("Error!! Data type must match type of motif.");
        System.err.println("Specifically, " + target + " is not " + type);
        System.exit(1);
    }

    /**
     * Command-line entry point. Expects four arguments: type, fastaFile,
     * motif and frame. Prints a usage message to stderr when fewer are given.
     */
    public static void main(String[] args)throws Exception{
        if (args.length < 4) {
            System.err.println(" Usage: >java -jar MotifLister.jar type fastaFile motif frame" +
                "\n Ex: >java -jar MotifLister.jar dna eColi.fasta AAAAAAG 3 > output.txt" +
                "\n would search for A AAA AAG in the third frame in dna file eColi.fasta" +
                "\n and print the results to file output.txt." +
                "\n 'type' can be dna, rna, or protein." +
                "\n 'frame' can be integers 0 through 3." +
                "\n 0 counts any instance of the motif." +
                "\n 1, 2, 3 counts only instances of the motif in the specified frame." +
                "\n Capture output with redirection operator '>'.");
        }else{
            // The constructor performs the whole search; the instance is not needed.
            new MotifLister(args[0], args[1], args[2], args[3]);
        }
    }
}