Source code for morpheus.classifier

# MIT License
# Copyright 2018 Ryan Hausen
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# ==============================================================================
"""An interface for interacting with Morpheus"""
import os
import time
import json
from subprocess import Popen
from typing import Iterable, List, Tuple, Callable, Dict, Union

import imageio
import numpy as np
import tensorflow.compat.v1 as tf
from astropy.io import fits
from matplotlib.colors import hsv_to_rgb
from scipy import ndimage as ndi
from skimage.feature import peak_local_max
from skimage.filters import sobel
from skimage.measure import regionprops
from skimage.morphology import watershed
from tqdm import tqdm

tf.disable_eager_execution()

import morpheus.core.helpers as helpers
import morpheus.core.model as model


[docs]class Classifier:
    """The primary interface for the use of Morpheus.

    Images can be classified by calling
    :py:meth:`~morpheus.classifier.Classifier.classify` and passing
    numpy arrays or string FITS file locations.

    After an image this this class offers some post processing functionality by
    generating segmentation maps using
    :py:meth:`~morpheus.classifier.Classifier.segmap_from_classified`, colorized
    morphological classifications using
    :py:meth:`~morpheus.classifier.Classifier.colorize_classification`, and
    generating catalogs using
    :py:meth:`~morpheus.classifier.Classifier.catalog_from_classified`.

    For more examples, see the `documentation <https://morpheus-astro.readthedocs.io/>`_.
    """

    __graph = None
    __session = None
    __X = tf.placeholder(tf.float32, shape=[None, 40, 40, 4])

[docs]    @staticmethod
    def classify(
        h: Union[np.ndarray, str] = None,
        j: Union[np.ndarray, str] = None,
        z: Union[np.ndarray, str] = None,
        v: Union[np.ndarray, str] = None,
        out_dir: str = None,
        batch_size: int = 1000,
        out_type: str = "rank_vote",
        gpus: List[int] = None,
        cpus: int = None,
        parallel_check_interval: float = 1,
    ) -> dict:
        """Generates per-pixel classifications from input images.

        Args:
            h (Union[np.ndarray, str]): The H band image or the path to it
            j (Union[np.ndarray, str]): The J band image or the path to it
            v (Union[np.ndarray, str]): The V band image or the path to it
            z (Union[np.ndarray, str]): The Z band image or the path to it
            out_dir (str): If provided, a directory to save the output to
            batch_size (int): The size of the batches to use when classifying the input
            out_type (str): The method by which to aggregate classifications
                            for a single pixel. Can be one of "rank_vote",
                            "mean_var", or "both"
            gpus (List[int]): The GPU ids to use for parallel classification
                              the ids can be found using ``nvidia-smi``
            cpus (int): The number of cpus to use for parallel classification.
            parallel_check_interval (float): If running a parallel job, how often
                                           to check on the running sub-processes
                                           in minutes.

        Returns:
            Dictionary containing the classification output for the given input

        Raises:
            ValueError if both gpus and cpus are given
            ValueError if mixed string and numpy arrays are given for h, j, v, z
            ValueError if h, j, v, or z are None
        """

        Classifier._variables_not_none(["h", "j", "v", "z"], [h, j, v, z])
        are_files = Classifier._valid_input_types_is_str(h, j, v, z)
        workers, is_gpu = Classifier._validate_parallel_params(gpus, cpus)

        if are_files:
            hduls, [h, j, v, z] = Classifier._parse_files(h, j, v, z)

            if out_dir is None:
                out_dir = "."
        else:
            hduls = []

        if len(workers) == 1:
            classified = Classifier._classify_arrays(
                h=h,
                j=j,
                v=v,
                z=z,
                out_type=out_type,
                out_dir=out_dir,
                batch_size=batch_size,
            )
        else:
            Classifier._build_parallel_classification_structure(
                [h, j, v, z], workers, batch_size, out_dir, out_type
            )
            Classifier._run_parallel_jobs(
                workers, is_gpu, out_dir, parallel_check_interval
            )
            Classifier._stitch_parallel_classifications(workers, out_dir, out_type)

            classification_hduls, classified = Classifier._retrieve_classifications(
                out_dir, out_type
            )

            hduls.extend(classification_hduls)

        for hdul in hduls:
            hdul.close()

        return classified

[docs]    @staticmethod
    def catalog_from_classified(
        classified: dict,
        flux: np.ndarray,
        segmap: np.ndarray,
        aggregation_scheme: Callable = None,
        out_file: str = None,
    ) -> List[Dict]:
        """Creates a catalog of sources and their morphologies.

        Args:
            classified (dict): A dictionary containing the output from morpheus.
            flux (np.ndarray): The corresponding flux image in H band
            segmap (np.ndarray): A labeled segmap where every pixel with a
                          value > 0 is associated with a source.
            aggregation_scheme (func): Function that takes three arguments `classified`,
                                       `flux`, and `segmap`, same as this
                                       function, then returns a numpy array
                                       containing the morphological classification
                                       in the following order-spheroid, disk,
                                       irregular, and point source/compact. If
                                       None, then the flux weighting scheme
                                       in
            out_file (str): a location to save the catalog. Can either be .csv
                            or .json. Anything else will raise a ValueError.


        Returns:
            A JSON-compatible list of dictionary objects with the following keys:
            {
                'id': the id from the segmap
                'location': a (y,x) location -- the max pixel within the segmap
                'morphology': a dictionary containing the morphology values.
            }
        """

        if out_file:
            if out_file.endswith((".csv", ".json")):
                is_csv = out_file.endswith(".csv")
            else:
                raise ValueError("out_file must end with .csv or .json")

        if aggregation_scheme is None:
            aggregation_scheme = Classifier.aggregation_scheme_flux_weighted

        catalog = []

        for region in regionprops(segmap, flux):
            _id = region.label

            if _id < 1:
                continue

            img = region.intensity_image
            seg = region.filled_image

            start_y, start_x, end_y, end_x = region.bbox
            dat = {}
            for k in classified:
                dat[k] = classified[k][start_y:end_y, start_x:end_x].copy()

            classification = aggregation_scheme(dat, img, seg)

            masked_flux = img * seg

            # https://stackoverflow.com/a/3584260
            y, x = np.unravel_index(masked_flux.argmax(), masked_flux.shape)
            y, x = int(start_y + y), int(start_x + x)

            catalog.append(
                {"id": _id, "location": [y, x], "morphology": classification}
            )

        if out_file:
            with open(out_file, "w") as f:
                if is_csv:
                    f.write("source_id,y,x,sph,dsk,irr,ps\n")

                    for c in catalog:
                        csv = "{},{},{},{},{},{},{}\n"
                        f.write(
                            csv.format(
                                c["id"],
                                c["location"][0],
                                c["location"][1],
                                c["morphology"][0],
                                c["morphology"][1],
                                c["morphology"][2],
                                c["morphology"][3],
                            )
                        )
                else:
                    json.dump(catalog, f)

        return catalog

    # TODO: make the output file with the FITS helper if the output dir is used.
[docs]    @staticmethod
    def segmap_from_classified(
        classified: dict,
        flux: np.ndarray,
        bkg_src_threshold: float = 0.0,
        out_dir: str = None,
        min_distance: int = 20,
        mask: np.ndarray = None,
        deblend: bool = True,
    ) -> np.ndarray:
        """Generate a segmentation map from the classification output.

        For more information about the segmentation process, see:
        https://arxiv.org/abs/1906.11248

        Args:
            data (dict): A dictionary containing the output from morpheus.
            flux (np.ndarray): The flux to use when making the segmap
            bkg_src_threshold (float): The max value that a background
                                       classification pixel can take and be
                                       considered a source. The default is 0.
                                       Should be between [0,1]
            out_dir (str): A path to save the segmap in.
            min_distance (int): The minimum distance for deblending
            mask (np.ndarry): A boolean mask indicating which pixels
            deblend (bool): If ``True``, perform deblending as described in 2.
                            in the algorithm description. If ``False`` return
                            segmap without deblending.

        Returns:
            A np.ndarray segmentation map
        """
        if bkg_src_threshold < 0 or bkg_src_threshold >= 1:
            err_msg = [
                "Invalid value for `bkg_src_threshold`, use a value in the ",
                "range [0, 1)",
            ]

            raise ValueError(err_msg)

        bkg = classified["background"]
        markers = np.zeros_like(flux, dtype=np.uint8)

        print("Building Markers...")
        if mask is None:
            mask = classified["n"] > 0

        is_bkg = np.logical_and(bkg == 1, mask)
        is_src = np.logical_and(bkg <= bkg_src_threshold, mask)

        markers[is_bkg] = 1
        markers[is_src] = 2

        sobel_img = sobel(bkg)

        print("Watershedding...")
        segmented = watershed(sobel_img, markers, mask=mask) - 1
        segmented[np.logical_not(mask)] = 0

        labeled, _ = ndi.label(segmented)

        labeled[np.logical_not(mask)] = -1

        if deblend:
            labeled = Classifier._deblend(labeled, flux, min_distance)

        if out_dir:
            fits.PrimaryHDU(data=labeled).writeto(os.path.join(out_dir, "segmap.fits"))

        return labeled

[docs]    @staticmethod
    def colorize_classified(
        classified: dict, out_dir: str = None, hide_unclassified: bool = True
    ) -> np.ndarray:
        """Makes a color image from the classification output.

        The colorization scheme is defined in HSV and is as follows:

        * Spheroid = Red
        * Disk = Blue
        * Irregular = Green
        * Point Source = Yellow

        The hue is set to be the color associated with the highest ranked class
        for a given pixel. The saturation is set to be the difference between the
        highest ranked class and the second highest ranked class for a given
        pixel. For example, if the top two classes have nearly equal values given
        by the classifier, then the saturation will be low and the pixel will
        appear more white. If the top two classes have very different
        values, then the saturation will be high and the pixel's color will be
        vibrant and not white. The value for a pixel is set to be 1-bkg, where
        bkg is value given to the background class. If the background class has
        a high value, then the pixel will appear more black. If the background
        value is low, then the pixel will take on the color given by the hue and
        saturation values.

        Args:
            data (dict): A dictionary containing the output from Morpheus.
            out_dir (str): a path to save the image in.
            hide_unclassified (bool): If true, black out the edges of the image
                                      that are unclassified. If false, show the
                                      borders as white.

        Returns:
            A [width, height, 3] array representing the RGB image.
        """
        red = 0.0  # spheroid
        blue = 0.7  # disk
        yellow = 0.18  # point source
        green = 0.3  # irregular

        shape = classified["n"].shape

        colors = np.array([red, blue, green, yellow])
        morphs = np.dstack(
            [classified[i] for i in helpers.LabelHelper.MORPHOLOGIES[:-1]]
        )
        ordered = np.argsort(-morphs, axis=-1)

        hues = np.zeros(shape)
        sats = np.zeros(shape)
        vals = 1 - classified["background"]

        # the classifier doesn't return values for this area so black it out
        if hide_unclassified:
            vals[0:5, :] = 0
            vals[-5:, :] = 0
            vals[:, 0:5] = 0
            vals[:, -5:] = 0

        for i in tqdm(range(shape[0])):
            for j in range(shape[1]):
                hues[i, j] = colors[ordered[i, j, 0]]
                sats[i, j] = (
                    morphs[i, j, ordered[i, j, 0]] - morphs[i, j, ordered[i, j, 1]]
                )

        hsv = np.dstack([hues, sats, vals])
        rgb = hsv_to_rgb(hsv)

        if out_dir:
            png = (rgb * 255).astype(np.uint8)
            imageio.imwrite(os.path.join(out_dir, "colorized.png"), png)

        return rgb

    @staticmethod
    def _retrieve_classifications(
        out_dir: str, out_type: str
    ) -> Tuple[List[fits.HDUList], dict]:

        f_names = []
        for morph in helpers.LabelHelper.MORPHOLOGIES:
            if out_type in ["mean_var", "both"]:
                f_names.extend(
                    [
                        os.path.join(out_dir, f"{morph}_mean.fits"),
                        os.path.join(out_dir, f"{morph}_var.fits"),
                    ]
                )
            if out_type in ["rank_vote", "both"]:
                f_names.append(os.path.join(out_dir, f"{morph}.fits"))

        f_names.append(os.path.join(out_dir, "n.fits"))

        hduls, arrs = helpers.FitsHelper.get_files(f_names)

        classified = {
            os.path.split(n)[1].replace(".fits", ""): a for n, a in zip(f_names, arrs)
        }

        return hduls, classified

    @staticmethod
    def _valid_input_types_is_str(
        h: Union[np.ndarray, str] = None,
        j: Union[np.ndarray, str] = None,
        z: Union[np.ndarray, str] = None,
        v: Union[np.ndarray, str] = None,
    ):
        in_types = {type(val) for val in [h, j, z, v]}

        if len(in_types) > 1:
            raise ValueError(
                "Mixed input type usuage. Ensure all are numpy arrays or strings."
            )

        t = in_types.pop()

        if t in [np.ndarray, str]:
            return t == str
        else:
            raise ValueError("Input type must either be numpy array or string")

    # NEW API ==================================================================

    @staticmethod
    def _classify_arrays(
        h: np.ndarray = None,
        j: np.ndarray = None,
        z: np.ndarray = None,
        v: np.ndarray = None,
        out_dir: str = None,
        batch_size: int = 1000,
        out_type: str = "rank_vote",
    ) -> Dict:
        """Classify numpy arrays using Morpheus.

        Args:
            h (np.ndarray): the H band values for an image
            j (np.ndarray): the J band values for an image
            z (np.ndarray): the Z band values for an image
            v (np.ndarray): the V band values for an image
            out_dir (str): The location where to save the output files
                           if None returns the output in memory only.
            batch_size (int): the number of image sections blackto process at a time
            out_type (str): how to process the output from Morpheus. If
                            'mean_var' record output using mean and variance, If
                            'rank_vote' record output as the normalized vote
                            count. If 'both' record both outputs.

        Returns:
            A dictionary containing the output classifications.

        Raises:
            ValueError if out_type is not one of ['mean_var', 'rank_vote', 'both']
        """
        Classifier._variables_not_none(["h", "j", "z", "v"], [h, j, z, v])
        Classifier._arrays_same_size([h, j, z, v])

        if out_type not in ["mean_var", "rank_vote", "both"]:
            raise ValueError("Invalid value for `out_type`")

        mean_var = out_type in ["mean_var", "both"]
        rank_vote = out_type in ["rank_vote", "both"]

        shape = h.shape

        hduls = []
        data = {}
        if out_dir:
            if mean_var:
                hs, ds = helpers.FitsHelper.create_mean_var_files(shape, out_dir)
                hduls.extend(hs)
                data.update(ds)
            if rank_vote:
                hs, ds = helpers.FitsHelper.create_rank_vote_files(shape, out_dir)
                hduls.extend(hs)
                data.update(ds)

            hs, ds = helpers.FitsHelper.create_n_file(shape, out_dir)
            hduls.extend(hs)
            data.update(ds)
        else:
            if mean_var:
                data.update(helpers.LabelHelper.make_mean_var_arrays(shape))
            if rank_vote:
                data.update(helpers.LabelHelper.make_rank_vote_arrays(shape))

            data.update(helpers.LabelHelper.make_n_array(shape))

        indicies = helpers.LabelHelper.windowed_index_generator(*shape)

        window_y, window_x = helpers.LabelHelper.UPDATE_MASK_N.shape
        batch_estimate = shape[0] - window_y + 1
        batch_estimate *= shape[1] - window_x + 1
        batch_estimate = batch_estimate // batch_size
        pbar = tqdm(total=batch_estimate, desc="classifying", unit="batch")

        while True:
            batch = []
            batch_idx = []

            for _ in range(batch_size):
                try:
                    y, x = next(indicies)
                except StopIteration:
                    break

                combined = np.array(
                    [img[y : y + window_y, x : x + window_x] for img in [h, j, v, z]]
                )
                batch.append(Classifier._standardize_img(combined))
                batch_idx.append((y, x))

            if not batch:
                break

            batch = np.array(batch)

            labels = Classifier._call_morpheus(batch)

            helpers.LabelHelper.update_labels(data, labels, batch_idx, out_type)

            pbar.update()

        if rank_vote:
            helpers.LabelHelper.finalize_rank_vote(data)

        for hdul in hduls:
            hdul.close()

        return data

    @staticmethod
    def _standardize_img(img: np.ndarray) -> np.ndarray:
        """Standardizes an input img to mean 0 and unit variance.

        Uses the formula described in:

        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            img (np.ndarray): the input array to standardize

        Returns:
            The standardized input
        """
        num = img - img.mean()
        denom = max(img.std(), 1 / np.sqrt(np.prod(img.shape)))
        return num / denom

    @staticmethod
    def _arrays_same_size(arrays: List[np.ndarray]) -> None:
        """Verifies that all arrays are the same shape.

        Args:
            arrays (List[np.ndarray]): List of arrays that should have the same
                                       shape.

        Returns:
            None

        Raises:
            ValueError if arrays are not the same shape
        """

        arr_shapes = [a.shape for a in arrays]

        arr_comp = arr_shapes[0]
        arr_to_comp = arr_shapes[1:]

        if not np.array_equiv(arr_comp, arr_to_comp):
            raise ValueError(f"All shapes not the same: {arr_shapes}.")

    @staticmethod
    def _variables_not_none(names: List[str], values: List[np.ndarray]) -> None:
        """Verifies that all variables are not None.

        Args:
            names (List[str]): list of names of variables in the same order as
                               `values`
            names (List[np.ndarray]): list of numpy arrays that should not be
                                      None

        Returns:
            None

        Raises:
            ValueError if a variable is None

        """

        nones = []
        for name, value in zip(names, values):
            if value is None:
                nones.append(name)

        if nones:
            raise ValueError("{} should not be None".format(nones))

    @staticmethod
    def _parse_files(
        h: str, j: str, v: str, z: str
    ) -> Tuple[List[fits.HDUList], List[np.ndarray]]:
        """Validates that files exist. And returns the corresponding arrays.

        Args:
            h (str): the file location of the H band img
            j (str): the file location of the J band img
            v (str): the file location of the V band img
            z (str): the file location of the Z bnad img

        Returns:
            A tuple containing the a (List[HDUL], List[np.ndarray])

        Raises:
            ValueError if a variable is None

        """
        Classifier._variables_not_none(["h", "j", "z", "v"], [h, j, z, v])

        return helpers.FitsHelper.get_files([h, j, v, z])

    @staticmethod
    def _call_morpheus(batch: np.ndarray) -> np.ndarray:
        """Use morpheus to classify a batch of input values.

        Morpheus is called as a singleton using this method.

        Args:
            batch (np.ndarray): The input data in the shape
                                [batch, channels, width, height]

        Returns:
            The classified numpy array with shape [batch, width, height, channels]

        """
        batch = np.transpose(batch, axes=[0, 2, 3, 1])

        if Classifier.__graph is None:
            config = model.Morpheus.inference_hparams()
            inference_dataset = model.Morpheus.mock_dataset()

            # build graph
            m = model.Morpheus(config, inference_dataset, "channels_last")
            Classifier.__graph = m.inference(Classifier.__X)

            # get weights
            saver = tf.train.Saver()
            Classifier.__session = tf.Session()
            w_location = model.Morpheus.get_weights_dir()
            saver.restore(Classifier.__session, tf.train.latest_checkpoint(w_location))

        return Classifier.__session.run(
            Classifier.__graph, feed_dict={Classifier.__X: batch}
        )

    @staticmethod
    def _get_split_length(shape: List[int], num_workers: int) -> int:
        """Calculate the size of the sub images for classification.

        Args:
            shape (List[int]): the shape of the array to be split
            num_workers (int): the number of splits to make

        Returns:
            The length of each split along axis 0

        TODO: Implement splits along other axes
        """

        return (shape[0] + (num_workers - 1) * 40) // num_workers

    @staticmethod
    def _get_split_slice_generator(
        shape: Tuple[int], num_workers: int, slice_length: int
    ) -> Iterable[slice]:
        """Creates a generator that yields `slice` objects to split imgs.

        Args:
            shape (Tuple[int]): The shape of the array to be split
            num_workers (int): The number of splits to make
            split_length (int): The length each slice should be

        Returns
            A generator that yields slice objects

        TODO: Implement splits along other axes
        """

        idx = 0
        for i in range(num_workers):
            start_idx = max(idx - 39, 0)

            if i == num_workers - 1:
                end_idx = shape[0]
            else:
                end_idx = start_idx + slice_length - 1

            idx = end_idx

            yield slice(start_idx, end_idx)

    @staticmethod
    def _make_runnable_file(
        path: str, batch_size: int = 1000, out_type: str = "rank_vote"
    ) -> None:
        """Creates a file at `path` that classfies local FITS files.

        Args:
            path (str): The dir to save the file in
            batch_size (int): The batch size for Morpheus to use when classifying
                              the input
            out_type (str): how to process the output from Morpheus. If
                            'mean_var' record output using mean and variance, If
                            'rank_vote' record output as the normalized vote
                            count. If 'both' record both outputs.

        Returns:
            None
        """

        local = os.path.dirname(os.path.dirname(__file__))
        text = [
            "import sys",
            f'sys.path.append("{local}")',
            "import os",
            "import numpy as np",
            "from tqdm import tqdm",
            "from morpheus.classifier import Classifier",
            "def main():",
            "    data_dir = '.'",
            "    output_dir = './output'",
            "    if 'output' not in os.listdir():",
            "        os.mkdir('./output')",
            "    files = {",
            "        'h':os.path.join(data_dir, 'h.fits'),",
            "        'j':os.path.join(data_dir, 'j.fits'),",
            "        'v':os.path.join(data_dir, 'v.fits'),",
            "        'z':os.path.join(data_dir, 'z.fits')",
            "    }",
            "    Classifier.classify(h=files['h'],",
            "                        j=files['j'],",
            "                        v=files['v'],",
            "                        z=files['z'],",
            f"                       batch_size={batch_size},",
            f'                       out_type="{out_type}",',
            "                        out_dir=output_dir)",
            "    sys.exit(0)",
            "if __name__=='__main__':",
            "    main()",
        ]

        with open(os.path.join(path, "main.py"), "w") as f:
            f.write("\n".join(text))

    @staticmethod
    def _build_parallel_classification_structure(
        arrs: List[np.ndarray],
        workers: List[int],
        batch_size: int,
        out_dir: str,
        out_type: str,
    ) -> None:
        """Sets up the subdirs and files to run the parallel classification.

        Args:
            arrs (List[np.ndarray]): List of arrays to split up in the order HJVZ
            workers (List[int]): A list of worker ID's that can either be CUDA GPU
                                 ID's or a list dummy numbers for cpu workers
            batch_size (int): The batch size for Morpheus to use when classifying
                              the input.
            out_dir (str): the location to place the subdirs in

        Returns:
            None
        """

        shape = arrs[0].shape
        num_workers = len(workers)
        split_slices = Classifier._get_split_slice_generator(
            shape, num_workers, Classifier._get_split_length(shape, num_workers)
        )

        for worker, split_slice in tqdm(zip(sorted(workers), split_slices)):
            sub_output_dir = os.path.join(out_dir, str(worker))
            os.mkdir(sub_output_dir)

            for name, data in zip(["h", "j", "v", "z"], arrs):
                tmp_location = os.path.join(sub_output_dir, "{}.fits".format(name))
                fits.PrimaryHDU(data=data[split_slice, :]).writeto(tmp_location)

            Classifier._make_runnable_file(sub_output_dir, batch_size, out_type)

    @staticmethod
    def _stitch_parallel_classifications(
        workers: List[int], out_dir: str, out_type: str
    ) -> None:
        """Stitch the seperate outputs made from the parallel classifications.

        Args:
            workers (List[int]): A list of worker ID's that can either be CUDA GPU
                                 ID's or a list dummy numbers for cpu workers
            out_dir (str): the location that contains the parallel classified
                           subdirs
            out_type (str): how to process the output from Morpheus. If
                            'mean_var' record output using mean and variance, If
                            'rank_vote' record output as the normalized vote
                            count. If 'both' record both outputs.

        Returns:
            None
        """
        jobs = []
        if out_type in ["mean_var", "both"]:
            jobs.append("mean_var")
        if out_type in ["rank_vote", "both"]:
            jobs.append("rank_vote")

        for morph in helpers.LabelHelper.MORPHOLOGIES:
            for job in jobs:
                if job == "mean_var":
                    to_be_stitched = []
                    for worker_id in workers:  # each worker was assinged a dir by id
                        dir_list = [out_dir, str(worker_id), "output"]
                        f_mean = os.path.join(*(dir_list + [f"{morph}_mean.fits"]))
                        f_var = os.path.join(*(dir_list + [f"{morph}_var.fits"]))
                        f_n = os.path.join(*(dir_list + ["n.fits"]))

                        to_be_stitched.append(
                            (
                                fits.getdata(f_mean),
                                fits.getdata(f_var),
                                fits.getdata(f_n),
                            )
                        )

                    new_y = sum(t[0].shape[0] for t in to_be_stitched)
                    new_y -= 39 * (len(to_be_stitched) - 1)

                    new_x = to_be_stitched[0][0].shape[1]

                    combined_mean = np.zeros(shape=[new_y, new_x], dtype=np.float32)
                    combined_var = np.zeros(shape=[new_y, new_x], dtype=np.float32)
                    combined_n = np.zeros(shape=[new_y, new_x], dtype=np.float32)

                    start_y = 0
                    for new_mean, new_var, new_n in to_be_stitched:
                        Classifier._merge_parallel_means_vars(
                            combined_mean,
                            combined_var,
                            combined_n,
                            new_mean,
                            new_var,
                            new_n,
                            start_y,
                        )

                        start_y += new_n.shape[0] - 39

                    to_write = [
                        (combined_mean, f"{morph}_mean.fits"),
                        (combined_var, f"{morph}_var.fits"),
                        (combined_n, "n.fits"),
                    ]

                    for f, n in to_write:
                        fits.PrimaryHDU(data=f).writeto(
                            os.path.join(out_dir, n), overwrite=True
                        )

                if job == "rank_vote":
                    to_be_stitched = []
                    for worker_id in workers:  # each worker was assinged a dir by id
                        dir_list = [out_dir, str(worker_id), "output"]
                        f_votes = os.path.join(*(dir_list + [f"{morph}.fits"]))
                        f_n = os.path.join(*(dir_list + ["n.fits"]))

                        to_be_stitched.append(
                            (fits.getdata(f_votes), fits.getdata(f_n))
                        )

                    new_y = sum(t[0].shape[0] for t in to_be_stitched)
                    new_y -= 39 * (len(to_be_stitched) - 1)

                    new_x = to_be_stitched[0][0].shape[1]

                    combined_votes = np.zeros(shape=[new_y, new_x], dtype=np.float32)
                    combined_n = np.zeros(shape=[new_y, new_x], dtype=np.float32)

                    start_y = 0
                    for new_votes, new_n in to_be_stitched:
                        Classifier._merge_parallel_rank_votes(
                            combined_votes, combined_n, new_votes, new_n, start_y
                        )

                        start_y += new_n.shape[0] - 39

                    to_write = [
                        (combined_votes, f"{morph}.fits"),
                        (combined_n, "n.fits"),
                    ]

                    for f, n in to_write:
                        fits.PrimaryHDU(data=f).writeto(
                            os.path.join(out_dir, n), overwrite=True
                        )

    @staticmethod
    def _merge_parallel_means_vars(
        total_mean: np.ndarray,
        total_var: np.ndarray,
        total_n: np.ndarray,
        new_mean: np.ndarray,
        new_var: np.ndarray,
        new_n: np.ndarray,
        y_idx: int,
    ) -> None:
        """Merge merge means/vars from a new piece to total.

        Derived from:
        https://www.emathzone.com/tutorials/basic-statistics/combined-variance.html

        Args:
            total (np.ndarray): The array of means to add ``new`` to
            total_n (np.ndarray): The array of counts to add ``new_n`` to
            new (np.ndarray): the new means to add to ``total``
            new_n (np.ndarray): the new counts to add to ``total``
            y_idx (int): index for placement of ``new`` into ``total`` along y axis

        Returns:
            None
        """
        ys = slice(y_idx, y_idx + new_mean.shape[0])

        x1, x2 = total_mean[ys, :].copy(), new_mean.copy()
        s1, s2 = total_var[ys, :].copy(), new_var.copy()
        n1, n2 = total_n[ys, :].copy(), new_n.copy()

        denominator = n1 + n2

        xc_numerator = n1 * x1 + n2 * x2
        xc = np.where(denominator > 0, xc_numerator / denominator, 0)

        sc_numerator = (n1 * (s1 + np.square(x1 - xc))) + (
            n2 * (s2 + np.square(x2 - xc))
        )
        sc = np.where(denominator > 0, sc_numerator / denominator, 0)

        total_mean[ys, :] = xc
        total_var[ys, :] = sc
        total_n[ys, :] = denominator

    @staticmethod
    def _merge_parallel_rank_votes(
        total_votes: np.ndarray,
        total_n: np.ndarray,
        new_votes: np.ndarray,
        new_n: np.ndarray,
        y_idx: int,
    ) -> None:
        """Merge vote counts from a new piece to total

        Args:
            total_count (np.ndarray): The array of votes to add ``new`` to
            total_n (np.ndarray): The array of counts to add ``new_n`` to
            new_votes (np.ndarray): The array of votes to add to ``total``
            new_n (np.ndarray): The array of counts to add to ``new``
            y_idx (int): index for placement pf ``new`` into ``total`` along y axis

        Returns:
            None
        """
        ys = slice(y_idx, y_idx + new_votes.shape[0])

        x1, x2 = total_votes[ys, :].copy(), new_votes.copy()
        n1, n2 = total_n[ys, :].copy(), new_n.copy()

        numerator = (n1 * x1) + (n2 * x2)
        denominator = n1 + n2
        mean = np.where(denominator > 0, numerator / denominator, 0)

        total_votes[ys, :] = mean
        total_n[ys, :] = denominator

    # TODO: Add an informative output.
    @staticmethod
    def _run_parallel_jobs(
        workers: List[int], is_gpu: bool, out_dir: str, parallel_check_interval: float
    ) -> None:
        """Starts and tracks parallel job runs.

        WARNING: This will not finish running until all subprocesses are complete

        Args:
            workers (List[int]): A list of worker ID's to assign to a portion of an
                                 image.
            is_gpu (bool): if True the worker ID's belong to NVIDIA GPUs and will
                           be used as an argument in CUDA_VISIBLE_DEVICES. If False,
                           then the ID's are assocaited with CPU workers
            out_dir (str): the location with the partitioned data
            parallel_check_interval (float): If gpus are given, then this is the number
                                             of minutes to wait between polling each
                                             subprocess for completetion

        Returns:
            None
        """

        processes = {}

        for worker in workers:
            if is_gpu:
                cmd_string = f"CUDA_VISIBLE_DEVICES={worker} python main.py"
            else:
                cmd_string = f"CUDA_VISIBLE_DEVICES=-1 python main.py"

            sub_dir = os.path.join(out_dir, str(worker))
            processes[worker] = Popen(cmd_string, shell=True, cwd=sub_dir)

        is_running = np.ones([len(workers)], dtype=np.bool)
        while is_running.any():
            for i, g in enumerate(sorted(workers)):
                if is_running[i] and (processes[g].poll() is not None):
                    is_running[i] = False

            if is_running.any():
                time.sleep(parallel_check_interval * 60)
            else:  # we're done we can skip sleep
                break

    @staticmethod
    def _validate_parallel_params(
        gpus: List[int] = None, cpus: int = None
    ) -> Tuple[List[int], bool]:
        """Validates that the parallelism scheme.

        Only one of the arguments should be given.

        Args:
            gpus (List[int]): A list of the CUDA gpu ID's to use for a
                              parallel classification.
            cpus (int): Number of cpus to use foa a parallel classification

        Returns:
            A tuple containing the list of worker ids and a boolean indicating
            wheter or not the ids belong to GPUS

        Raises:
            ValueError if both cpus and gpus are not None
        """

        # invalid params
        if (gpus is not None) and (cpus is not None):
            raise ValueError("Please only give a value cpus or gpus, not both.")

        # Simple serial run
        if (gpus is None) and (cpus is None):
            return [0], False

        if gpus is not None:
            if len(gpus) == 1:
                err = "Only one gpus indicated. If you are trying to select "
                err += "a single gpu, then use the CUDA_VISIBLE_DEVICES environment "
                err += "variable. For more information visit: "
                err += "https://devblogs.nvidia.com/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/"

                raise ValueError(err)
            else:
                return gpus, True
        else:
            if cpus < 2:
                raise ValueError(
                    "If passing cpus please indicate a value greater than 1."
                )

            return np.arange(cpus), False

    @staticmethod
    def _deblend(segmap: np.ndarray, flux: np.ndarray, min_distance: int) -> np.ndarray:
        """Deblends a segmentation map according to the description in make_segmap.

        Args:
            segmap (np.ndarray): The segmentation map image to deblend
            flux (np.ndarray): The corresponding flux image in H band
            min_distance (int): The radius of the PSF for the instrument used on H band

        Returns:
            A np.ndarray representing the deblended segmap
        """

        max_id = segmap.max()

        for region in tqdm(regionprops(segmap, flux), desc="Deblending"):

            # greater than 1 indicates that the region is not background
            if region.label > 0:
                flx = region.intensity_image
                seg = region.filled_image
                flux_map = flx * seg

                maxes = peak_local_max(
                    flux_map, min_distance=min_distance, num_peaks=20
                )

                # more than 1 source found, deblend
                if maxes.shape[0] > 1:
                    start_y, start_x, end_y, end_x = region.bbox
                    markers = np.zeros_like(seg, dtype=np.int)

                    for y, x in maxes:
                        max_id += 1
                        markers[y, x] = max_id

                    deblended = watershed(-flux_map, markers, mask=seg)

                    local_segmap = segmap[start_y:end_y, start_x:end_x].copy()
                    local_segmap = np.where(seg, deblended, local_segmap)
                    segmap[start_y:end_y, start_x:end_x] = local_segmap

        return segmap

[docs]    @staticmethod
    def aggregation_scheme_flux_weighted(
        data: dict, flux: np.ndarray, segmap: np.ndarray
    ) -> List[float]:
        """Aggregates pixel level morphological classifications to the source level.

        Uses a flux-weighted mean of the pixel level morphologies to calculate
        the aggregate source level morphology.

        Args:
            data (dict): A dictionary containing the output from morpheus.
            flux (np.ndarray): The corresponding flux image in H band
            segmap (int): The binary map indicating pixels that belong to the
                          source

        Returns:
            The morphological classification as a list of floats in the
            following order: ['spheroid', 'disk', 'irregular', 'point source']
        """
        classifications = np.zeros([4])

        morphs = ["spheroid", "disk", "irregular", "point_source"]

        morphs = [data[m] for m in morphs]

        for i, m in enumerate(morphs):
            classifications[i] = np.mean(m[segmap] * flux[segmap])

        return (classifications / classifications.sum()).tolist()