# -*- coding: utf-8 -*-
"""
@author: Elizabeth Watkins
This script is a pipeline that performs PCA, using a given PCA method, on
background-subtracted data to form a PCA library, and is also a wrapper that
applies the PCA library to similar data to remove subtraction residuals.
Since it is expected to be run on optical spectra to remove sky-subtraction
residuals from science data, the naming conventions of the variables used in
this pipeline reflect this.
Requires numpy and scipy to be installed.
Requires the robust PCA and gappy reconstruction scripts.
Copyright (c) 2023, Elizabeth Jayne Watkins
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
"""
#TODO: Need a function (or set of functions) that stacks all the spectra into a 2d array. This will involve loading in the spectra and then stacking them together. Will need path management. The result should then be saved as a file.
#TODO: Need a function that takes in skycorr results and outputs a total skyline mask.
#TODO: Dictionary for skycorr functions.
import warnings
import robustpca as pca
import gappyrecon as gappy
import numpy as np
import copy
import utilitymasking as util
import configpca as config
def _pop_pca_kwargs(kwargs):
"""Separates out kwargs for the method and kwargs for the PCA
Parameters
----------
kwargs : TYPE
DESCRIPTION.
Returns
-------
kwargs : TYPE
DESCRIPTION.
separate_kwargs : TYPE
DESCRIPTION.
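Example (illustrative; 'some_pca_option' stands in for a hypothetical key
of `config.pca_kwargs`, and 'filter_size' for a method keyword):

    kwargs, pca_kwargs = _pop_pca_kwargs(
        {'some_pca_option': 10, 'filter_size': 31})
    # pca_kwargs now holds 'some_pca_option' (plus defaults for any other
    # keys in config.pca_kwargs); kwargs is left with 'filter_size' only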
"""
separate_kwargs = {}
for key in config.pca_kwargs.keys():
separate_kwargs[key] = kwargs.pop(key, config.pca_kwargs[key])
return kwargs, separate_kwargs
# ================= Default functions ======================
def project_spectra(spectra, errors, eigen_vectors, mean_spectra,
namplitudes=5, run_mask=False, return_extra=False):
"""Default function to reconstruct a spectra using the `gappy` method.
This routine nicely deals with 'missing data' (such as if there is
a science line present at a skyline location, and so part of the skyline
has been masked out where the science line is, or if some pixels read
out badly). It does this by filling the gap with a reconstruction of the
data at this location by solving the data normalisation. This has the
advantage of keeping the pca euclidean space exact, which minimises
some artefacts, but comes at the cost that the reconstructions are not
fully orthogonal any more. It is recommended to ALWAYS reconstruct using
this function, unless an alternative reconstruction method is used
that also implements smart handling of data gaps.
Parameters
----------
spectra : numpy.ndarray
The spectra that will be used to make the reconstruction
errors : numpy.ndarray
Errors. Just needs to be 0's where to ignore the data and 1's
everywhere else.
eigen_vectors : numpy.ndarray
The sky library eigen vectors.
mean_spectra : numpy.1darray
The mean spectra.
namplitudes : int, optional
The number of components used to reconstruct. The default is 5.
get_run_mask : bool, optional
Whether to apply the mask where reconstruction performed on. The
default is False.
return_extra : bool, optional
If `return_extra` is True extra parameter `used_spectra` is returned
or the extra parameter is None if `get_run_mask` is None. The default
is False.
Returns
-------
reconstructed_spectra : numpy.ndarray
The reconstructed spectra.
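Example (illustrative; the array shapes and variable names are assumptions):

    # spectra, errors   : (n_spectra, n_pixels)
    # eigen_vectors     : (n_pixels, n_components)
    # mean_spectra      : (n_pixels,)
    reconstruction = project_spectra(spectra, errors, eigen_vectors,
                                     mean_spectra, namplitudes=5)
    # subtracting the reconstruction is one way to remove the modelled
    # sky-subtraction residuals from the spectra
    cleaned = spectra - reconstruction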
"""
if namplitudes == 0:
return spectra
e1_to_construct = eigen_vectors[:,:namplitudes]
spectra, errors = [util.check_make_2d(ar) for ar in [spectra, errors]]
if run_mask:
final_spectra = np.zeros_like(spectra)
# gappy.run_normgappy is vectorised. array[None], just adds a dummy dimension
# so that we can run the code (array.shape = [N] array[None].shape = [1,N])
scores, norm, used_spectra = gappy.run_normgappy(errors, spectra, mean_spectra, e1_to_construct, return_run_mask=run_mask)
reconstructed_spectra = (scores @ e1_to_construct.T) + mean_spectra
final_spectra[used_spectra] = reconstructed_spectra
final_spectra[~used_spectra] = spectra[~used_spectra]
else:
scores, norm = gappy.run_normgappy(errors, spectra, mean_spectra, e1_to_construct)
final_spectra = (scores @ e1_to_construct.T) + mean_spectra
if return_extra:
if run_mask:
return final_spectra, used_spectra
else:
return final_spectra, None
else:
return final_spectra
def get_rms(spectra, rms_mask, where_data_good_and_errors, rms_percentile=90):
"""Default function to find the rms of the data robustly by using a
percentile value rather than the raw mean (i.e., if `rms_percentile`=50
it would be the root median squared). Default value is 90th percentile
of the rms values to minimise outliers, as described in Wild et al.
Parameters
----------
spectra : numpy.ndarray
The spectra.
rms_mask : numpy.ndarray
Which pixels from the spectra are being used for the the rms. Not the
same as where the data is good or not. Only part of the data
for the rms (i.e., on or sky) is needed
where_data_good_and_errors : numpy.ndarray
0's where the data is bad. The rest contain the error values.
rms_percentile : float or int, optional
The percentile above which the rms is calculated from. The default is
90.
Returns
-------
rms_per_specta : numpy.ndarray
The rms value for each spectra.
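Example (illustrative, with tiny hypothetical arrays; the values are made up):

    spectra = np.array([[0.5, 2.0, 30.0, 4.0]])
    errors = np.array([[1.0, 1.0, 1.0, 1.0]])   # all pixels usable
    rms_mask = np.array([[1, 1, 0, 1]])         # third pixel is not a sky pixel
    rms = get_rms(spectra, rms_mask, errors, rms_percentile=90)
    # rms has one value per spectrum: the 90th percentile of
    # spectra/errors over the unmasked sky pixels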
"""
#Ignore outliers indicated in the error spectra that might not
#have been indicated yet
# where_data_good[error_spectra==0] = 0
data_over_error = (spectra / where_data_good_and_errors)
where_data_good = copy.deepcopy(where_data_good_and_errors)
where_data_good[where_data_good !=0] = 1
where_data_good = where_data_good.astype(bool)
rms_mask = rms_mask.astype(bool)
data_over_error[~where_data_good] = np.nan
data_over_error[~rms_mask] = np.nan
#check
rms_per_spectra = np.nanpercentile(data_over_error, rms_percentile, axis=1)
return rms_per_spectra
#!!!NOTES for doc
#we used the median Poisson noise of the skypixels in older sdss because the
#sky was next to the object, and represented the Poisson noise that the sky
#gave independent of the science. We do not have this in LVM. We have
#a sky that is far away that is used to model the sky at the science.
#The issue is that the Poisson noise from the sky telescopes is going to be very different
#from the science.
#With skycorr, and the fact it gives us a model sky in the science
# it has separated out the sky noise from the object noise. And so
#Vivian used the skycorr model as a proxy for the Poisson noise.
#So when making the sky library I can normalise by the error arrays but this
#would not keep things consistent, so I'll also need to normalise by the final
#skycorr model of the sky spectra
#Need to ask what is kept in the skymodel. It should be all the
#sky emission that needs to be subtracted
class pcaPrep():
"""Description: This class is a wrapper for the preparation needed to build
a PCA library and/or before applying a PCA library to identify and remove
correlated noise/outlier features present throughout different instances
of an observation set.
...
Attributes
----------
prep_functions : dict
Dictionary containing the preprocessing functions with the following
keys:
`'continuum'`
`'normalise'`
`'skylines'`
`'science_lines'`
`'outliers'`
sky_spectra : numpy.ndarray
The final calibrated, sky subtracted spectra that contain sky
subtraction residuals.
noise_gen_data : numpy.ndarray
Data/spectra that will be used to estimate or obtain the Poisson
noise and to normalise the spectra with.
observation_numbers : numpy.1darray
Iterable ID numbers of the spectra to preserve knowledge of
where they came from. Some analysis/prep requires keeping track
of the observation set etc.
skyline_mask : numpy.1darray, optional
If known, the position of skylines to perform the PCA analysis
upon. This 1d array applies to all spectra. The default is None.
error_spectra : numpy.ndarray, optional
If the physical error arrays of the spectra are different to the
array needed when normalising the spectra by its Poisson noise, add
the noise array using this variable (needed for skycorr). The
default is None.
noise_gen_is_error : bool, optional
If the error spectra are the same as the data needed to normalise
the spectra using the Poisson noise, set this to True, else
leave as default or as False. The default is False.
verbose : bool, optional
If True, prints out updates as the class runs its methods.
The default is True.
TODO: Add print statements, logs, and diagnostic plots
processed_spectra : dict
Dictionary stores the result of each preprocessing step
method_kwargs : dict or None, optional
If additional variables are needed when running the preprocessing,
they can be provided as a dictionary of keyword-argument dictionaries
keyed by method name. For example, if `get_science_lines` and
`get_outliers` need additional variables (e.g. an array containing
all known optical lines to help with science line identification
for `get_science_lines`, or a non-default option in the
outlier method provided, such as sigma=5, MAD=True),
provide them as:
method_kwargs = {
    'science_lines' : {
        'all_optical' : `YOUR_ARRAY`
    },
    'outliers' : {
        'sigma' : 5,
        'MAD' : True
    }
}
The default is None.
_start : str
The default name of the default starting method for the preprocessing.
The default is 'continuum'
_end : str
The default name of the default final method of the preprocessing.
The default is 'outliers'
Methods
-------
get_continuum(full=False, **kwargs)
Performs continuum subtraction on the spectra. If full is `True`,
the continuum found before the subtraction is also returned.
get_normalise(full=False, **kwargs)
Normalises the spectra. If full is `True`, the normalisation used is
also returned.
get_skylines(**kwargs)
Gets the skyline mask as a 1d array. This is a mask for the data that
one wants to USE, so 1's/True are set at skyline locations, and 0's/
Falses at positions that do not contain skylines
get_science_lines(mask=None, **kwargs)
Method for finding and masking science lines as a 2d array. To ensure
that skylines, or other known common features throughout the spectra
are not accidentally identified as science lines, either a mask at these
positions needs to be provided or `get_skylines` needs to be run
beforehand and be saved within the attribute `processed_spectra`. Since
this mask identifies pixels we DO NOT want the PCA to see,
science line locations are set to 0 (False) and the remaining usable
positions are set to 1 (True).
get_outliers(mask=None, science_line_masks=None, **kwargs)
Identifies where an entire spectrum is statistically an outlier
and masks the entire spectrum. This can be either a 1d array
at the positions to ignore, or can be a 2d array where the pixels
for the bad spectrum are all masked. To ensure that skylines, or other
known common features throughout the spectra do not impact the spectra
rejection criteria, these locations need to be provided, or need to
have already been generated by the object before running this method.
!!!WARNING: Individual outlier pixels are not meant to be masked since these
are part of the noise statistics that the PCA needs to make the eigen
spectra. What we need to mask are spectra that overall look bad and need
to be excluded from further analysis.
run(start=None, end=None, method_kwargs=None)
Convenience function to run the default class methods while
defining the start and stop of the methods to run. These methods
are (in order):
`'continuum'`
`'normalise'`
`'skylines'`
`'science_lines'`
`'outliers'`
autorun(autorun):
If autorun has been set at class initialisation, the class
will automatically run the analysis. If the start and/or end
have been provided when initialising, these will be used, else
it automatically runs all the prep methods in order. These are:
`continuum`
`normalise`
`skylines`
`science_lines`
`outliers`
Purpose of preprocessing
------------------------
To generate or apply a PCA library, any source/science features occurring
within the data, including emission, absorption, and continuum, have to be
ignored, otherwise they will be identified by the PCA as features and will
result in the loss of science flux during the reconstruction and subtraction
process. If the noise is proportional to the signal, this also has to be
accounted for by normalising this property away; otherwise, places with more
flux, and therefore higher noise, will be more strongly weighted by the PCA,
resulting in a biased reconstruction that prioritises these features.
Therefore, in this case, the data needs to be normalised by the noise to
negate this effect. For optical IFU data, the Poisson noise needs to be
removed before the PCA library is built and the PCA reconstruction is run.
For LVM the preprocessing steps are therefore (in order):
1. Continuum removal
2. Normalisation by the Poisson noise
3. Skyline identification
4. Science line identification
5. Outlier identification
There are many ways to do the preprocessing steps, therefore this class
only provides the functionality to run these steps in a predefined way and
check which prerequisite steps have been performed. The actual functions to
perform this preprocessing need to be provided by the user for their
specific data. This allows the pipeline to be used for different
techniques, including previous SDSS implementations of the PCA routine and
easily allows different methods to be tested and used.
Information for LVM
---------------------
For LVM, untested functions for some of the preprocessing using skycorr
have been written and are set as the default if no other preprocessing
functions are provided. Skycorr is very different to what was done before,
and means there is no direct way to calculate the Poisson noise, since a MODEL
is subtracted from the data, NOT an average sky spectrum. Since Poisson
noise is proportional to the square root of the photon counts, we can try
and estimate the Poisson noise added to the spectra by the sky flux by
assuming the skycorr model is a good representation of the flux; therefore
we can use the square root of the skycorr model that has been subtracted as
an estimate of the Poisson noise.
Final regards
-------------
Typically, this class will only be run by itself for testing. The analysis
classes run this class themselves so the user should not need to touch this
class. If the user does want to run it separately and then input it into the
analysis class, please limit the wavelength range to the part of the
spectrum that needs sky residuals removed. This can be done using the
convenience decorator function `mask_arrays_columns` like so:
#pp is the pcaPrep object
pp = mask_arrays_columns(wavelength_mask)(pcaPrep)(
sky_spectra=sky_spectra, noise_gen_data=noise_gen_data,
observation_numbers=observation_numbers,
subtraction_methods=subtraction_methods, **kwargs
)
USAGE SCRAP NOTES
-----------------
`observation_numbers` as long as it is iterable, should be
what allows you to keep track of which observation the spectra
came from. Some reductions use the median Poisson noise per
observation to normalise the sky AND the science. Use a convention
that allows you to preserve this information for both
the sky and science reductions.
Needs to be iterable since later reduction steps remove bad spectra
using their indices.
TODO: CHECK NAN HANDLING WORKS
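Example (illustrative; the input arrays and the contents of
`subtraction_methods` are placeholders that the user must supply):

    pp = pcaPrep(
        sky_spectra=sky_spectra,                  # (n_spectra, n_pixels)
        noise_gen_data=noise_gen_data,            # e.g. skycorr sky models
        subtraction_methods=subtraction_methods,  # dict of prep functions
        observation_numbers=observation_numbers,
        autorun=True,
    )
    prepped = pp.processed_spectra['normalise']
    skyline_mask = pp.processed_spectra['skylines']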
"""
def __init__(self, sky_spectra, noise_gen_data, subtraction_methods, \
observation_numbers=None, skyline_mask=None, error_spectra=None, \
autorun=False, noise_gen_is_error=False, verbose=True, \
method_kwargs=None):
"""
Parameters
----------
sky_spectra : numpy.ndarray
The final calibrated, sky subtracted spectra that contain sky
subtraction residuals.
noise_gen_data : numpy.ndarray
Data/spectra that will be used to estimate or obtain the Poisson
noise and to normalise the spectra with.
observation_numbers : numpy.1darray
Iterable ID numbers of the spectra to preserve knowledge of
where they came from. Some analysis/prep requires keeping track
of the observation set etc.
subtraction_methods : dict
What sky subtraction methods to use, which are wrapped into
the pipeline.
skyline_mask : numpy.ndarray, optional
If known, the position of skylines to perform the PCA analysis
upon. The default is None.
error_spectra : numpy.ndarray, optional
If the physical error arrays of the spectra are different to the
array needed when normalising the spectra by its Poisson noise, add
the noise array using this variable (needed for skycorr). The
default is None.
autorun : bool or 2-list of strings, optional
Whether to automatically run the nominal version of this pipeline.
If True, assumes the data needs all the prep performed (continuum
removal, Poisson noise normalisation, skyline identification,
science line masking and outlier removal in that order). If the
start or end point of the prep pipeline can be skipped, the user
defined starting and end point can be provided and the pipeline
will run from these steps in order. The order and method names are:
`'continuum'`
`'normalise'`
`'skylines'`
`'science_lines'`
`'outliers'`
To run, for example, from normalise to the end, you can either
provide just the string `'normalise'`, or a list such as
`['normalise', None]`, `['normalise', 'outliers']`,
`['normalise', '']` or `['normalise', True]`.
To limit just the end point to science line identification
(`'science_lines'`), provide the list
`[None, 'science_lines']`, `['', 'science_lines']` or
`[True, 'science_lines']`. If a different start and end want to
be automatically run, provide their names as strings in a 2-list,
for example `['normalise', 'science_lines']`. If `False` is provided
within the 2-list, and the other option is a method, only that
single method is run. This can also be achieved by entering the
method name twice, such as `['outliers', 'outliers']`, to run just
the outlier method. This functionality is best accessed using the
`get_<method>` methods: to run just the skyline identification,
initialise the class with no autorun and call
`this_object.get_skylines()`. The default is False.
noise_gen_is_error : bool, optional
If the error spectra are the same as the data needed to normalise
the spectra using the Poisson noise, set this to True, else
leave as default or as False. The default is False.
verbose : bool, optional
If True, prints out updates as the class runs its methods.
The default is True.
TODO: Currently, few print statements, logs, or diagnostic
plots have been coded.
method_kwargs : dict or None, optional
If additional variables are needed when running the preprocessing,
they can be provided as a dictionary of keyword-argument dictionaries
keyed by method name. For example, if `get_science_lines` and
`get_outliers` need additional variables (e.g. an array containing
all known optical lines to help with science line identification
for `get_science_lines`, or a non-default option in the
outlier method provided, such as sigma=5, MAD=True),
provide them as:
method_kwargs = {
    'science_lines' : {
        'all_optical' : `YOUR_ARRAY`
    },
    'outliers' : {
        'sigma' : 5,
        'MAD' : True
    }
}
The default is None.
Returns
-------
None.
"""
#noise_gen_data is usually just error_spectra, but some reductions (e.g.,
#skycorr) use a different data set for generating the noise, so
#they are defined as two different variables.
sky_spectra, error_spectra = util.nan_inf_masking(sky_spectra, error_spectra)
self.prep_functions = subtraction_methods
self.sky_spectra = sky_spectra
self.noise_gen_data = noise_gen_data
self.observation_numbers = observation_numbers
self.skyline_mask = skyline_mask
self.error_spectra = error_spectra
self.noise_gen_is_error = noise_gen_is_error
self.verbose = verbose
self.method_kwargs = method_kwargs
self.processed_spectra = {}
self._start = 'continuum'
self._end = 'outliers'
self.autorun(autorun)
def get_continuum(self, full=False, **kwargs):
"""Method runs the continuum function that has been provided. Inputs
multiple arrays to accommodate any potential continuum removal
methods. Median filter just requires the data and filter size
Parameters
----------
full : bool, optional
If False, only the continuum subtracted data is returned. If True,
both the continuum subtracted and the found continuum are returned.
The default is False.
kwargs :
Any additional variables needed for the provided continuum
subtraction function.
Returns
-------
contiunuum_subtracted_spectra : numpy.ndarray
Data that has been continuum subtracted.
Optional
--------
continuum : numpy.ndarray
The continuum found is returned if `full` is True
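Example (illustrative sketch of a conforming continuum function; the use
of `scipy.ndimage.median_filter` and the `filter_size` keyword are
assumptions, not pipeline requirements):

    from scipy.ndimage import median_filter

    def median_continuum(spectra, filter_size=51, **kwargs):
        # smooth each spectrum along the wavelength axis to estimate
        # the continuum, then subtract it
        continuum = median_filter(spectra, size=(1, filter_size))
        return continuum, spectra - continuum

    # subtraction_methods['continuum'] = median_continuum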
"""
continuum_func = self.prep_functions['continuum']
continuum, continuum_subtracted_spectra = continuum_func(
    self.sky_spectra,
    noise_gen_data=self.noise_gen_data,
    error_spectra=self.error_spectra,
    observation_numbers=self.observation_numbers,
    **kwargs
)
if full:
    return continuum_subtracted_spectra, continuum
else:
    return continuum_subtracted_spectra
def get_normalise(self, full=False, **kwargs):
"""Method runs the normalisation if need using the function that has
been provided.
Parameters
----------
full : bool, optional
If False, only the normalised data is returned. If True,
both the normalised data, and the data used to normalise are
returned. The default is False.
kwargs :
Any additional variables needed for the provided normalisation
function.
Returns
-------
normed_sky_spectra : numpy.ndarray
Data that has been normalised.
Optional
--------
poisson_noise : numpy.ndarray
The normalisation used is returned if `full` is True
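Example (illustrative sketch of a conforming normalisation function;
using the square root of `noise_gen_data` as the Poisson-noise estimate
follows the skycorr-based approach described in the class docstring, but
the function itself is only a sketch):

    def poisson_normalise(spectra, noise_gen_data, **kwargs):
        # estimate the Poisson noise from the (sky-model) counts and
        # divide it out of the residual spectra
        poisson_noise = np.sqrt(np.abs(noise_gen_data))
        # a third returned element could be used to set `noise_gen_is_error`
        return spectra / poisson_noise, poisson_noise

    # subtraction_methods['normalise'] = poisson_normalise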
"""
norm_func = self.prep_functions['normalise']
#uses the most recent data product (so if data needed to be continuum
#subtracted, that data is pulled into this method)
spectra = self._get_previous_reduced_data('normalise')
output = norm_func(
spectra=spectra,
noise_gen_data=self.noise_gen_data,
error_spectra=self.error_spectra,
observation_numbers=self.observation_numbers,
**kwargs)
#We want the option to output whether the data used to generate the norm
#is the errors or not. This is a messy way of checking for this later,
#since it allows us to reuse this class to prep the science data,
#but has the prerequisite of needing the error spectra, whereas
#the library generation does not after the sky has been normalised
#here.
if isinstance(output, tuple):
    if len(output) == 3:
        normed_sky_spectra, poisson_noise, self.noise_gen_is_error = output
    else:
        normed_sky_spectra, poisson_noise = output
else:
    #assume the provided function returned only the normalised spectra
    normed_sky_spectra, poisson_noise = output, None
if full:
    return normed_sky_spectra, poisson_noise
else:
    return normed_sky_spectra
def get_skylines(self, **kwargs):
"""Gets the skyline mask using the method provided.
NOTE: Skycorr automatically creates a skyline mask that we can use.
If this is the case, basically just provide a dummy function or the
skyline mask when initialising the class under the attribute
`skyline_mask`.
If master skys are being used, rather than using skycorr, then skylines
need to be found using an algorithm/function.
Parameters
----------
kwargs :
Any additional variables needed for the provided skyline
identification function.
Returns
-------
mask : numpy.1darray
1d bitmap mask (functionality also allows bool maps I believe)
where 1's (True) identify where there is a skyline, 0's (False)
are not a skyline. Since we only run PCA where we expect skylines
the mask is 1's/True at these locations.
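Example (illustrative sketch of a conforming skyline-mask function; the
median-based threshold is an assumption, not the pipeline's method):

    def find_skylines(spectra, **kwargs):
        # wavelength columns with systematically large residuals are
        # flagged as skylines (1), everything else is 0
        column_strength = np.nanmedian(np.abs(spectra), axis=0)
        threshold = 3.0 * np.nanmedian(column_strength)
        return (column_strength > threshold).astype(int)

    # subtraction_methods['skylines'] = find_skylines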
"""
if self.skyline_mask is not None:
return self.skyline_mask
else:
skyline_func = self.prep_functions['skylines']
spectra = self._get_previous_reduced_data('skylines')
mask = skyline_func(spectra=spectra,
noise_gen_data=self.noise_gen_data,
error_spectra=self.error_spectra,
observation_numbers=self.observation_numbers,
skyline_mask=self.skyline_mask,
**kwargs)
return mask # we want skylines so the mask here is 1's/True for skyline, 0/False otherwise
def get_science_lines(self, mask=None, **kwargs):
"""Method for finding and masking science lines
Parameters
----------
mask : numpy.1darray, optional
Mask of data that we know already are not science lines (such as
skylines) that we want to ignore when finding science lines. 1's
(True) indicate there is a skyline there, 0's (False) indicate
it does not contain a skyline.
The default is None.
kwargs :
Any additional variables needed for the provided science line
identification function.
Returns
-------
science_line_masks : numpy.ndarray
Bitmap mask (functionality also allows bool maps I believe)
where 1's (True) identify data we want to use, 0's (False) are
science lines we need to ignore.
"""
if mask is None:
# if 'skylines' not in self.processed_spectra:
# mask =
mask = self._run_prerequisite_method('skylines')
spectra = self._get_spectra(config.prep_names['normalise'])#self._get_spectra(prep_names['continuum'])
science_lines_func = self.prep_functions['science_lines']
science_line_masks = science_lines_func(spectra=spectra,
skyline_mask=mask,
error_spectra=self.error_spectra,
observation_numbers=self.observation_numbers,
**kwargs)
# zeros/False indicate we don't want to use this data, so we exclude zeros/false and
#keep ones/True
return science_line_masks
def get_outliers(self, mask=None, science_line_masks=None, **kwargs):
"""Identifies where an entire spectrum is statistically an outlier
and masks the entire spectrum.
NOTE: We do not mask individual outliers since these are part
of the noise statistics that the PCA needs to make the Eigen-spectra.
What we need to mask are spectra that overall look bad and need to
be excluded from further analysis.
Parameters
----------
mask : numpy.ndarray, optional
Mask of data that we know already to ignore (such as
skylines) when finding outlier spectra. 1's (True) are skyline
locations. The default is None.
science_line_masks : numpy.ndarray, optional
Mask of data that we already know are science lines and that we
want to ignore when finding outlier spectra.
The default is None.
kwargs :
Any additional variables needed for the provided outlier
identification function.
Returns
-------
outlier_spectra_mask : numpy.ndarray
1d array (can be 2d if the entire spectra have been masked)
indicating which entire spectra to ignore (0's/False)
and which spectra to use (1's/True).
"""
if mask is None:
# if 'skylines' not in self.processed_spectra:
mask = self._run_prerequisite_method('skylines')
if science_line_masks is None:
# if 'science_lines' not in self.processed_spectra:
science_line_masks = self._run_prerequisite_method('science_lines')
spectra = self._get_spectra(config.prep_names['normalise'])#self._get_spectra(prep_names['continuum'])
outlier_func = self.prep_functions['outliers']
# zeros/False indicate we don't want to use this data, so we exclude
# zeros/False and keep ones/True
outlier_spectra_mask = outlier_func(spectra=spectra,
skyline_mask=mask,
science_line_mask=science_line_masks,
error_spectra=self.error_spectra,
observation_numbers=self.observation_numbers,
**kwargs)
# This is a 1d array for the rows we want to exclude from analysis
# (zeros/False) and include (ones). But with how I've set the
#future functions up, this can be a 2darray?
return outlier_spectra_mask
def run(self, start=None, end=None, method_kwargs=None):
"""Convenience function to run the default class methods while
defining the start and stop of the methods to run. These methods
are (in order):
`'continuum'`
`'normalise'`
`'skylines'`
`'science_lines'`
`'outliers'`
Parameters
----------
start : string, optional
Method to begin running. The default is None, which means
self._start will be called which is currently set to `'continuum'`.
end : string, optional
Method to run until. The default is None, which means
self._end will be called which is currently set to `'outliers'`.
method_kwargs : dict or None
If additional variables are needed when running the preprocessing,
they can be provided as a dictionary of keyword-argument dictionaries
keyed by method name. For example, if `get_science_lines` and
`get_outliers` need additional variables (e.g. an array containing
all known optical lines to help with science line identification
for `get_science_lines`, or a non-default option in the
outlier method provided, such as sigma=5, MAD=True),
provide them as:
method_kwargs = {
    'science_lines' : {
        'all_optical' : `YOUR_ARRAY`
    },
    'outliers' : {
        'sigma' : 5,
        'MAD' : True
    }
}
Returns
-------
None.
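Example (illustrative; `sigma` is an option of the user-provided outlier
function, as in the `method_kwargs` description above):

    pp.run(start='normalise', end='outliers',
           method_kwargs={'outliers': {'sigma': 5}})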
"""
if method_kwargs is None:
if self.method_kwargs is None:
method_kwargs = {}
else:
method_kwargs = self.method_kwargs
#adds flexibility if slightly different formats are entered
start, end = self._run(start, end)
#This makes sure that the method strings entered exist. If they do not
#a key error is thrown here and is easy to identify
start_int = config.prep_names[start]
end_int = config.prep_names[end]
for i in range(start_int, end_int+1):
method = config.prep_steps[i] #dict
kwargs = method_kwargs.pop(method, {})
# kwargs = {} if kwargs is None else kwargs
self.processed_spectra[method] = getattr(self, 'get_' + method)(**kwargs) #(*(label, pointings))
def _run(self, start, end):
"""Takes in various inputs of which method to run and outputs their
string equivalent. `None`, `''`, `True` all result in the default
string being used. For `start`, this is `self._start`, which should be
`'continuum'`, for `end`, this should be `'outliers'`
If `False` is provided for start or end
within the 2-list, and the other option is a method, only that
single method is run. This can also be achieved by entering the
method name twice, such as ['outliers', 'outliers'], to run just the
outlier method. This functionality is best accessed using the
`get_<method>` methods (i.e., to run just the skyline
identification, initialise the class with no autorun and call
`this_object.get_skylines()`).
Parameters
----------
start : string
Method to begin running.
end : string
Method to run until.
Returns
-------
start : string
Method to begin running.
end : string
Method to run until.
"""
start_str = end if start is False else start
end_str = start if end is False else end
choose_default_method_options = ['', None, True]
start = start_str if start_str not in choose_default_method_options else self._start
end = end_str if end_str not in choose_default_method_options else self._end
return start, end
def autorun(self, autorun):
"""If autorun has been set at class initialisation, class
will automatically run the analysis. If the start and/or end
have been provided when initialising, these will be used, else
it automatically runs all the prep methods in order. These are:
`continuum`
`normalise`
`skylines`
`science_lines`
`outliers`
Parameters
----------
autorun : bool, None or 2-list
If provided, `autorun` contains a list of where to start and stop
the analysis. If `''`, `None` or `True` is provided in the 2-list,
the default start/end are run. If `False` is provided within the
2-list, and the other option is a method, only that single method
is run, though this functionality is best accessed using the
`get_<method>` methods (i.e., to run just the skyline identification,
initialise the class with no autorun and call the method
`this_object.get_skylines()`).
Returns
-------
None.
"""
if autorun == True:
    self.run()
    # self.run(self._start, self._end)
elif autorun is None or autorun == '' or autorun == False:
    pass
elif isinstance(autorun, str):
    #a single string is treated as the starting method; run to the end
    self.run(start=autorun)
elif all([not x for x in autorun]):
    pass
else:
    start, end = self._autorun(autorun)
    self.run(start, end)
def _autorun(self, autorun):
"""For more varied inputs that have obvious meanings, this method
provides a decision tree to interpret various inputs and
provide outputs that can be recognised later on when running
the prep method.
Parameters
----------
autorun : bool, None or 2-list
If provided, `autorun` contains a list of where to start and stop
the analysis. If `''`, `None` or `True` is provided in the 2-list,
the default start/end are run. If `False` is provided within the
2-list, and the other option is a method, only that single method
is run, though this functionality is best accessed using the
`get_<method>` methods (i.e., to run just the skyline identification,
initialise the class with no autorun and call the method
`this_object.get_skylines()`).
Returns
-------
start : str, bool or None
start method
end : str, bool or None
end method
"""
if autorun == True:
    # if isinstance(autorun, bool):
    #the caller runs the pipeline itself, so return the default bounds
    return self._start, self._end
else:
if len(autorun) > 2:
raise IndexError(
'More than two methods were given. Provide either' \
' a single method to indicate the start, or two to' \
' indicate the start and stop.')
elif len(autorun) == 1:
# if run through autorun, this never triggers
if autorun[0] == False:
return None
elif isinstance(autorun[0], str) or autorun[0] == True:
start = autorun[0]
end = 'outliers'
else:
start = autorun[0]
end = autorun[1]
return start, end
def _get_previous_reduced_data(self, current_process):
"""Wrapper that finds the last run process and returns that data if there.
Parameters
----------
current_process : string
String indicating the current method being run
Returns
-------
data : numpy.ndarray
The data from the preprocessing step that occurred chronologically
before the method given as the string `current_process`.
"""
current_step = config.prep_names[current_process]
previous_step = current_step-1
data = self._get_spectra(previous_step)
return data
def _get_spectra(self, processing_step):
"""Method outputs the prepped data provided, checking if potential.
prerequisite data has been run, and if not, provides a warning
Parameters
----------
processing_step : string
String indicating the data that has been called to be retrieved.
Returns
-------
data_for_current_step : numpy.ndarray
Data for the method requested.
Warnings
--------
Raises a warning to the user if a preprocessing step is being run
but the data in the previous step does not exist. In some cases,
the relevant preprocessing steps have already been done to the data
outside of this pipeline, and so if this is the case, ignore this
warning
"""
processing_method = config.prep_steps[processing_step]
if processing_method in self.processed_spectra:
data_for_current_step = self.processed_spectra[processing_method]
else:
if self.verbose:
    steps = ', '.join(config.prep_steps[n]
                      for n in range(processing_step - 1, 0, -1))
    warnings.warn(processing_method + ' and the earlier reduction '
                  'steps, ' + steps + ', have not been run. If '
                  'entered spectra have not had these steps '
                  'performed already, PCA will give incorrect '
                  'results.')
data_for_current_step = self.sky_spectra
return data_for_current_step
def _run_prerequisite_method(self, method, run_if_missing=False):