| 1 | #!/usr/local/bin/python |
|---|
| 2 | """ |
|---|
| 3 | :py:mod:`obiselect` : Select and print sequences identified by a list of IDs |
|---|
| 4 | ============================================================================ |
|---|
| 5 | |
|---|
| 6 | .. codeauthor:: Eric Coissac <eric.coissac@metabarcoding.org> |
|---|
| 7 | |
|---|
| 8 | :py:mod:`obiselect` command allows to select a set of sequences |
|---|
| 9 | Example: |
|---|
| 10 | |
|---|
| 11 | .. code-block:: bash |
|---|
| 12 | |
|---|
| 13 | > obiselect ... |
|---|
| 14 | |
|---|
| 15 | |
|---|
| 16 | """ |
|---|
| 17 | from obitools.format.options import addInOutputOption, sequenceWriterGenerator,\ |
|---|
| 18 | addInputFormatOption |
|---|
| 19 | from obitools.options import getOptionManager |
|---|
| 20 | from obitools.ecopcr.options import addTaxonomyDBOptions, loadTaxonomyDatabase |
|---|
| 21 | from random import random |
|---|
| 22 | import math |
|---|
| 23 | |
|---|
| 24 | def minimum(seqs): |
|---|
| 25 | return min(s['select'] for s in seqs) |
|---|
| 26 | |
|---|
| 27 | def maximum(seqs): |
|---|
| 28 | return max(s['select'] for s in seqs) |
|---|
| 29 | |
|---|
| 30 | def mean(seqs): |
|---|
| 31 | ss= reduce(lambda x,y: x + y,(s['select'] for s in seqs),0) |
|---|
| 32 | return float(ss) / len(seqs) |
|---|
| 33 | |
|---|
| 34 | def median(seqs): |
|---|
| 35 | ss = [s['select'] for s in seqs] |
|---|
| 36 | ss.sort() |
|---|
| 37 | return ss[len(ss)/2] |
|---|
| 38 | |
|---|
| 39 | |
|---|
| 40 | |
|---|
| 41 | def addSelectOptions(optionManager): |
|---|
| 42 | |
|---|
| 43 | optionManager.add_option('-c','--category-attribute', |
|---|
| 44 | action="append", dest="categories", |
|---|
| 45 | metavar="<Attribute Name>", |
|---|
| 46 | default=[], |
|---|
| 47 | help="Add one attribute to the list of" |
|---|
| 48 | " attribute used for categorizing sequences") |
|---|
| 49 | |
|---|
| 50 | optionManager.add_option('-n','--number', |
|---|
| 51 | action="store", dest="number", |
|---|
| 52 | metavar="", |
|---|
| 53 | type="int", |
|---|
| 54 | default=1, |
|---|
| 55 | help="select sequence in each group minimizing the function") |
|---|
| 56 | |
|---|
| 57 | optionManager.add_option('-f','--function', |
|---|
| 58 | action="store", dest="function", |
|---|
| 59 | metavar="", |
|---|
| 60 | default="random", |
|---|
| 61 | help="select sequence in each group minimizing the function") |
|---|
| 62 | |
|---|
| 63 | |
|---|
| 64 | optionManager.add_option('-m','--min', |
|---|
| 65 | action="store_const", dest="method", |
|---|
| 66 | metavar="", |
|---|
| 67 | default=maximum, |
|---|
| 68 | const=minimum, |
|---|
| 69 | help="select sequence in each group minimizing the function") |
|---|
| 70 | |
|---|
| 71 | optionManager.add_option('-M','--max', |
|---|
| 72 | action="store_const", dest="method", |
|---|
| 73 | metavar="<Attribute Name>", |
|---|
| 74 | default=maximum, |
|---|
| 75 | const=maximum, |
|---|
| 76 | help="select sequence in each group maximizing the function") |
|---|
| 77 | |
|---|
| 78 | optionManager.add_option('-a','--mean', |
|---|
| 79 | action="store_const", dest="method", |
|---|
| 80 | metavar="<Attribute Name>", |
|---|
| 81 | default=maximum, |
|---|
| 82 | const=mean, |
|---|
| 83 | help="select sequence in each group closest to the mean of the function") |
|---|
| 84 | |
|---|
| 85 | |
|---|
| 86 | optionManager.add_option('--median', |
|---|
| 87 | action="store_const", dest="method", |
|---|
| 88 | metavar="<Attribute Name>", |
|---|
| 89 | default=maximum, |
|---|
| 90 | const=median, |
|---|
| 91 | help="select sequence in each group closest to the median of the function") |
|---|
| 92 | |
|---|
| 93 | optionManager.add_option('--merge', |
|---|
| 94 | action="append", dest="merge", |
|---|
| 95 | metavar="<TAG NAME>", |
|---|
| 96 | type="string", |
|---|
| 97 | default=[], |
|---|
| 98 | help="attributes to merge") |
|---|
| 99 | |
|---|
| 100 | optionManager.add_option('--merge-ids', |
|---|
| 101 | action="store_true", dest="mergeids", |
|---|
| 102 | default=False, |
|---|
| 103 | help="add the merged id data to output") |
|---|
| 104 | |
|---|
| 105 | |
|---|
| 106 | |
|---|
| 107 | def sortclass(seqs,options): |
|---|
| 108 | cible = float(options.method(seqs)) |
|---|
| 109 | for s in seqs: |
|---|
| 110 | s['distance']=math.sqrt((float(s['select'])-cible)**2) |
|---|
| 111 | seqs.sort(lambda s1,s2 : cmp(s1['distance'],s2['distance'])) |
|---|
| 112 | |
|---|
| 113 | |
|---|
| 114 | |
|---|
| 115 | if __name__ == '__main__': |
|---|
| 116 | |
|---|
| 117 | optionParser = getOptionManager([addSelectOptions,addInOutputOption,addTaxonomyDBOptions]) |
|---|
| 118 | |
|---|
| 119 | (options, entries) = optionParser() |
|---|
| 120 | |
|---|
| 121 | taxonomy=loadTaxonomyDatabase(options) |
|---|
| 122 | |
|---|
| 123 | writer = sequenceWriterGenerator(options) |
|---|
| 124 | |
|---|
| 125 | classes = {} |
|---|
| 126 | |
|---|
| 127 | for s in entries: |
|---|
| 128 | category = [] |
|---|
| 129 | for c in options.categories: |
|---|
| 130 | try: |
|---|
| 131 | if hasattr(options, 'taxonomy') and options.taxonomy is not None: |
|---|
| 132 | environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()} |
|---|
| 133 | else: |
|---|
| 134 | environ = {'sequence':s,'random':random()} |
|---|
| 135 | |
|---|
| 136 | v = eval(c,environ,s) |
|---|
| 137 | category.append(v) |
|---|
| 138 | except: |
|---|
| 139 | category.append(None) |
|---|
| 140 | |
|---|
| 141 | category=tuple(category) |
|---|
| 142 | group = classes.get(category,[]) |
|---|
| 143 | group.append(s) |
|---|
| 144 | classes[category]= group |
|---|
| 145 | |
|---|
| 146 | if hasattr(options, 'taxonomy') and options.taxonomy is not None: |
|---|
| 147 | environ = {'taxonomy' : options.taxonomy,'sequence':s,'random':random()} |
|---|
| 148 | else: |
|---|
| 149 | environ = {'sequence':s} |
|---|
| 150 | |
|---|
| 151 | try: |
|---|
| 152 | select = eval(options.function,environ,s) |
|---|
| 153 | s['select']=select |
|---|
| 154 | except: |
|---|
| 155 | s['select']=None |
|---|
| 156 | |
|---|
| 157 | mergedKey = options.merge |
|---|
| 158 | mergeIds = options.mergeids |
|---|
| 159 | |
|---|
| 160 | if mergedKey is not None: |
|---|
| 161 | mergedKey=set(mergedKey) |
|---|
| 162 | else: |
|---|
| 163 | mergedKey=set() |
|---|
| 164 | |
|---|
| 165 | if taxonomy is not None: |
|---|
| 166 | mergedKey.add('taxid') |
|---|
| 167 | |
|---|
| 168 | |
|---|
| 169 | for c in classes: |
|---|
| 170 | seqs = classes[c] |
|---|
| 171 | sortclass(seqs, options) |
|---|
| 172 | if len(c)==1: |
|---|
| 173 | c=c[0] |
|---|
| 174 | |
|---|
| 175 | if options.number==1: |
|---|
| 176 | s = seqs[0] |
|---|
| 177 | |
|---|
| 178 | for key in mergedKey: |
|---|
| 179 | if key=='taxid' and mergeIds: |
|---|
| 180 | if 'taxid_dist' not in s: |
|---|
| 181 | s["taxid_dist"]={} |
|---|
| 182 | if 'taxid' in s: |
|---|
| 183 | s["taxid_dist"][s.id]=s['taxid'] |
|---|
| 184 | mkey = "merged_%s" % key |
|---|
| 185 | if mkey not in s: |
|---|
| 186 | s[mkey]={} |
|---|
| 187 | if key in s: |
|---|
| 188 | s[mkey][s[key]]=s[mkey].get(s[key],0)+1 |
|---|
| 189 | del(s[key]) |
|---|
| 190 | |
|---|
| 191 | if 'count' not in s: |
|---|
| 192 | s['count']=1 |
|---|
| 193 | if mergeIds: |
|---|
| 194 | s['merged']=[s.id] |
|---|
| 195 | |
|---|
| 196 | for seq in seqs[1:]: |
|---|
| 197 | |
|---|
| 198 | s['count']+=seq['count'] |
|---|
| 199 | |
|---|
| 200 | for key in mergedKey: |
|---|
| 201 | if key=='taxid' and mergeIds: |
|---|
| 202 | if 'taxid_dist' in seq: |
|---|
| 203 | s["taxid_dist"].update(seq["taxid_dist"]) |
|---|
| 204 | if 'taxid' in seq: |
|---|
| 205 | s["taxid_dist"][seq.id]=seq['taxid'] |
|---|
| 206 | |
|---|
| 207 | mkey = "merged_%s" % key |
|---|
| 208 | if key in seq: |
|---|
| 209 | s[mkey][seq[key]]=s[mkey].get(seq[key],0)+1 |
|---|
| 210 | if mkey in seq: |
|---|
| 211 | for skey in seq[mkey]: |
|---|
| 212 | if skey in s: |
|---|
| 213 | s[mkey][skey]=s[mkey].get(seq[skey],0)+seq[mkey][skey] |
|---|
| 214 | else: |
|---|
| 215 | s[mkey][skey]=seq[mkey][skey] |
|---|
| 216 | |
|---|
| 217 | for key in seq.iterkeys(): |
|---|
| 218 | # Merger proprement l'attribut merged s'il exist |
|---|
| 219 | if key in s and s[key]!=seq[key] and key!='count' and key[0:7]!='merged_' and key!='merged': |
|---|
| 220 | del(s[key]) |
|---|
| 221 | |
|---|
| 222 | |
|---|
| 223 | if mergeIds: |
|---|
| 224 | s['merged'].append(seq.id) |
|---|
| 225 | |
|---|
| 226 | if taxonomy is not None: |
|---|
| 227 | mergeTaxonomyClassification(seqs, taxonomy) |
|---|
| 228 | |
|---|
| 229 | |
|---|
| 230 | |
|---|
| 231 | |
|---|
| 232 | |
|---|
| 233 | |
|---|
| 234 | |
|---|
| 235 | for s in seqs[0:options.number]: |
|---|
| 236 | s['class']=c |
|---|
| 237 | writer(s) |
|---|