# Source code for transcriptic.jupyter.dataset

import warnings

from copy import deepcopy

import pandas as pd

from .common import _BaseObject
from .container import Container
from .dataobject import DataObject


class Dataset(_BaseObject):
    """
    A Dataset object contains helper methods for accessing data related information

    Attributes
    ----------
    id : str
        Dataset id
    name: str
        Dataset name
    data : DataFrame
        DataFrame of well-indexed data values. Note that associated metadata is
        found in attributes dictionary
    attachments : dict(str, bytes)
        names and data of all attachments for the dataset
    container: Container
        Container object that was used for this dataset, or None when the
        attributes carry no usable container entry
    operation: str
        Operation used for generating the dataset, or None when the attributes
        carry no instruction operation
    data_type: str
        Data type of this dataset
    attributes: dict
        Master attributes dictionary
    connection: transcriptic.config.Connection
        Transcriptic Connection object associated with this specific object

    Methods
    -------
    data_objects()
        List of DataObject type
    cross_ref_aliquots()
        Container aliquots DataFrame with an added "Aliquot Data" column
    """

    def __init__(self, data_id, attributes=None, connection=None):
        """
        Initialize a Dataset by providing a data name/id. The attributes and
        connection parameters are generally not specified unless one wants to
        manually initialize the object.

        Parameters
        ----------
        data_id: str
            Dataset name or id in string form
        attributes: Optional[dict]
            Attributes of the dataset
        connection: Optional[transcriptic.config.Connection]
            Connection context. The default context object will be used unless
            explicitly provided

        Raises
        ------
        KeyError
            If the attributes lack any of the "title", "analysis_tool",
            "analysis_tool_version" or "data_type" keys.
        """
        super().__init__("dataset", data_id, attributes, connection)
        # TODO: Get BaseObject to handle dataset name
        self.name = self.attributes["title"]
        self.id = data_id

        # TODO: Consider more formally distinguishing between dataset types
        try:
            self.operation = self.attributes["instruction"]["operation"]["op"]
        except KeyError:
            self.operation = None

        try:
            self.container = Container(
                self.attributes["container"]["id"],
                attributes=self.attributes["container"],
                connection=connection,
            )
        except KeyError as e:
            # Only warn for datasets that came from an instruction; for other
            # datasets the missing container is tolerated silently.
            if "instruction" in self.attributes:
                warnings.warn(f"Missing key {e} when initializing dataset")
            self.container = None

        self.analysis_tool = self.attributes["analysis_tool"]
        self.analysis_tool_version = self.attributes["analysis_tool_version"]
        self.data_type = self.attributes["data_type"]

        # Lazily populated caches; None means "not fetched yet".
        self._raw_data = None
        self._data = pd.DataFrame()
        self._attachments = None
        self._data_objects = None

    @property
    def attachments(self):
        """Attachments of this dataset, fetched through the connection once."""
        # Compare against None so that an empty (falsy) result is cached too
        # instead of triggering a new request on every access.
        if self._attachments is None:
            self._attachments = self.connection.attachments(data_id=self.id)
        return self._attachments

    @property
    def raw_data(self):
        """Raw data of this dataset, fetched through the connection once."""
        # Compare against None: an empty payload is still a valid cached value,
        # and array-like payloads do not have an unambiguous truth value.
        if self._raw_data is None:
            # Get all raw data
            self._raw_data = self.connection.dataset(data_id=self.id, key="*")
        return self._raw_data

    @property
    def data(self, key="*"):
        """
        Raw data cast to a DataFrame with upper-cased string column labels.

        Because this is a property, ``key`` cannot be supplied by callers and
        always takes its default, so the whole DataFrame is returned.

        Raises
        ------
        RuntimeError
            If the raw data cannot be obtained or cast to a DataFrame. The
            underlying exception is chained as the cause.
        """
        if self._data.empty:
            # Get all data initially (think about lazy loading in the future)
            try:
                frame = pd.DataFrame(self.raw_data)
            except Exception as err:
                raise RuntimeError(
                    "Failed to cast data as DataFrame. Try using raw_data property "
                    "instead."
                ) from err
            # Normalise on a local frame and publish it only once complete, so a
            # failure here can never leave a half-initialised cache behind.
            # Non-string labels have no upper-case form and are kept as they are.
            frame.columns = [
                col.upper() if isinstance(col, str) else col for col in frame.columns
            ]
            self._data = frame

        if key == "*":
            return self._data
        return self._data[key]

    def data_objects(self):
        """
        Return the DataObjects of this dataset.

        The list is created with ``DataObject.init_from_dataset_id`` on the
        first call and reused afterwards.
        """
        if self._data_objects is None:
            self._data_objects = DataObject.init_from_dataset_id(self.id)
        return self._data_objects

    def cross_ref_aliquots(self):
        """
        Cross-reference the container aliquots with the data of this dataset.

        Returns
        -------
        DataFrame
            Deep copy of ``container.aliquots`` with an "Aliquot Data" column.
            For every aliquot index the column holds the value found in
            ``data`` at row label 0 under the humanized well index, or None
            when ``data`` has no such column.

        Raises
        ------
        AttributeError
            If this dataset has no associated container.

        Warns
        -----
        UserWarning
            If an existing "Aliquot Data" column is overwritten, and if any
            well index has no matching data key.
        """
        if self.container is None:
            raise AttributeError(
                "Dataset has no associated container; cannot cross-reference "
                "aliquots."
            )

        # Use the container.aliquots DataFrame as the base
        aliquot_data = deepcopy(self.container.aliquots)

        # Print a warning if new column will overwrite existing column
        if "Aliquot Data" in aliquot_data.columns:
            warnings.warn(
                "Column 'Aliquot Data' will be overwritten with data pulled from "
                "Dataset."
            )

        # Resolve loop invariants once instead of on every iteration
        data = self.data
        humanize = self.container.container_type.humanize

        data_column = []
        indices_without_data = []
        # Look up data for every well index
        for index in aliquot_data.index:
            # Get humanized index
            humanized_index = humanize(int(index))
            if humanized_index in data:
                # Use humanized index to get data for that well
                data_point = data.loc[0, humanized_index]
            else:
                # If no data for that well, use None instead
                data_point = None
                indices_without_data.append(humanized_index)
            # Append data point to list
            data_column.append(data_point)

        # Print a list of well indices that do not have corresponding data keys
        if indices_without_data:
            warnings.warn(
                "The following indices were not found as data keys: %s"
                % ", ".join(indices_without_data)
            )

        # Add these data as a column to the DataFrame
        aliquot_data["Aliquot Data"] = data_column
        return aliquot_data

    def _repr_html_(self):
        """Return an iframe pointing at the "view_data" route of this dataset."""
        return """<iframe src="%s" frameborder="0" allowtransparency="true" \
        style="height:400px; width:600px" seamless></iframe>""" % self.connection.get_route(
            "view_data", data_id=self.id
        )