Dataset 2

  • Please extract data.zip into the directory “data”

[1]:
# Selects which dataset the notebook processes. The parameter, species and
# Config cells below all branch on this value; switch by swapping the comment.
# data_option = "dataset1"
data_option = "dataset2"

Initial setup

[2]:
from speciesot import configure_platform, Config, Data, SpeciesOT
[3]:
# Select the JAX backend for this session; keep exactly one call active.
# The recorded output below reports the backend that was chosen.
configure_platform()  # For macOS with Apple Silicon
# configure_platform("gpu")  # For Linux or WSL2 with an NVIDIA GPU
# configure_platform("cpu")  # For other platforms
JAX is configured to use: METAL

Computational parameters

[4]:
# Dataset-specific settings. All four names are later passed positionally to
# Config(), so every supported dataset must define each of them.
if data_option == "dataset1":
    mask_option = "time_series_data"
    threshold = 3.0
    threshold_surer = 3.5
    high_epsilon = 0.01

elif data_option == "dataset2":
    mask_option = "one_time_point_data"
    threshold = 1.4
    threshold_surer = 2.5
    high_epsilon = 0.1

else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(
        f"Unknown data_option {data_option!r}; expected 'dataset1' or 'dataset2'."
    )
[5]:
# Settings shared by both datasets; each is passed positionally to Config() below.
# NOTE(review): their exact meaning is not visible in this notebook. Presumably
# `iterations` caps the solver iterations and `low_epsilon` is the lower end of
# the epsilon range whose upper end is `high_epsilon` — confirm against the
# speciesot Config documentation.
iterations = 1000
threshold_eps = 1e-4
low_epsilon = 0.0
threshold_tol = 3.0
[6]:
# Species definitions per dataset. `species` and `species_labels` have the same
# length and order in each branch (presumably identifier / display label pairs —
# confirm). `species_pairs` is left empty for both datasets.
if data_option == "dataset1":
    species = ["human", "macaque", "mouse"]
    species_pairs = []
    species_labels = ["Human", "Macaque", "Mouse"]

elif data_option == "dataset2":
    species = ["human", "chimpanzee", "gorilla", "orangutan", "macaque", "mouse"]
    species_pairs = []
    species_labels = [
        "Human_iPSC(AK02)",
        "Chimp_iPSC(AK02)",
        "Gorilla_iPSC(AITS)",
        "Orang_iPSC(AITS)",
        "Macaque_ESC(AITS)",
        "Mouse_EpiLC",
    ]

else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(
        f"Unknown data_option {data_option!r}; expected 'dataset1' or 'dataset2'."
    )

Initialize the Config() class

[7]:
# Both datasets use exactly the same Config arguments apart from the dataset
# name, so the name is taken from `data_option` instead of duplicating the call.
if data_option not in ("dataset1", "dataset2"):
    # Same set of names the original branches accepted; anything else is an error.
    raise ValueError(
        f"Unknown data_option {data_option!r}; expected 'dataset1' or 'dataset2'."
    )

# Arguments are positional up to `threshold_surer`; the order must match the
# Config signature, so do not reorder them.
config = Config(
    data_option,
    "drop",
    "distinct",
    "auto",
    "euclidean",
    "original",
    "fixed",  # "min" for exploring minimum converging varepsilon_min
    mask_option,
    iterations,
    threshold_eps,
    low_epsilon,
    high_epsilon,
    threshold_tol,
    threshold,
    threshold_surer,
    species=species,
    species_pairs=species_pairs,
    species_labels=species_labels,
)

Initialize the Data() class

[8]:
# Build the Data object from the configuration; the next cells call its
# methods one step at a time.
data = Data(config)

Read CSV file

[9]:
# Rebind `data` to the value returned by read_csv(); later cells continue from it.
# Presumably reads from the "data" directory mentioned at the top — confirm.
data = data.read_csv()

Geometrization steps (noise reduction and total count normalization)

[10]:
# Emits one RECODE log block per run in the output below (six in the recorded
# dataset2 run, presumably one per species — confirm).
data = data.normalization()
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(16535), '#non-significant genes': np.int64(15612), '#silent genes': np.int64(0), 'ell': np.int64(65), 'Elapsed time': '0h 0m 8s 162ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(15569), '#non-significant genes': np.int64(12952), '#silent genes': np.int64(0), 'ell': np.int64(32), 'Elapsed time': '0h 0m 1s 708ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(15118), '#non-significant genes': np.int64(14736), '#silent genes': np.int64(0), 'ell': np.int64(66), 'Elapsed time': '0h 0m 10s 766ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(14588), '#non-significant genes': np.int64(14096), '#silent genes': np.int64(0), 'ell': np.int64(84), 'Elapsed time': '0h 0m 9s 622ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(14746), '#non-significant genes': np.int64(11152), '#silent genes': np.int64(0), 'ell': np.int64(119), 'Elapsed time': '0h 0m 10s 795ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(12010), '#non-significant genes': np.int64(9099), '#silent genes': np.int64(0), 'ell': np.int64(43), 'Elapsed time': '0h 0m 2s 299ms', 'solver': 'full'}

Geometrization step (gene selection using human transcription factors)

[11]:
# Rebind `data` to the value returned by read_tf(); presumably this loads the
# transcription-factor list used for gene selection — confirm.
data = data.read_tf()

Initialize the SpeciesOT() class

[12]:
# Hand the prepared data to SpeciesOT; the remaining steps are methods on this object.
spe_ot = SpeciesOT(data)

Geometrization step (filtering)

[13]:
# Filtering step; `spe_ot` is rebound to the returned object for the next cell.
spe_ot = spe_ot.preprocessing()

Geometrization step (distance matrix computation)

[14]:
# Distance-matrix step; must run before the optimal-transport cell in this tutorial's order.
spe_ot = spe_ot.calculate_gene_distance_matrix()

Entropically regularized Gromov-Wasserstein optimal transport

[15]:
# Runs the optimal-transport solve. The recorded output ends with a
# convergence line for the epsilon used (0.1 in the dataset2 run).
spe_ot = spe_ot.gromov_wasserstein_ot()
Platform 'METAL' is experimental and not all JAX functionality may be correctly supported!
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
W0000 00:00:1762787318.943704 2416463 mps_client.cc:510] WARNING: JAX Apple GPU support is experimental and not all JAX functionality is correctly supported!
I0000 00:00:1762787318.954833 2416463 service.cc:145] XLA service 0x35624e1b0 initialized for platform METAL (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1762787318.954851 2416463 service.cc:153]   StreamExecutor device (0): Metal, <undefined>
I0000 00:00:1762787318.956100 2416463 mps_client.cc:406] Using Simple allocator.
I0000 00:00:1762787318.956111 2416463 mps_client.cc:384] XLA backend will use up to 103078739968 bytes on device 0 for SimpleAllocator.
Metal device set to: Apple M3 Max

systemMemory: 128.00 GB
maxCacheSize: 48.00 GB

epsilon = 0.1000000 converged

Normalized optimal transport plan

[16]:
# Normalize the transport plan; `spe_ot` is rebound to the returned object.
spe_ot = spe_ot.normalize_otp()

Transcriptomic discrepancy

[17]:
# Produces the figures shown below (two images in the recorded run); the
# return value is intentionally not assigned here.
spe_ot.plot_transcriptomic_discrepancy()
_images/tutorial2_31_0.png
_images/tutorial2_31_1.png

Chained Processing

[18]:
# Same four data-preparation steps as the step-by-step cells above, written as
# a single method chain with one step per line.
data2 = (
    Data(config)
    .read_csv()
    .normalization()
    .read_tf()
)
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(16535), '#non-significant genes': np.int64(15612), '#silent genes': np.int64(0), 'ell': np.int64(65), 'Elapsed time': '0h 0m 8s 811ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(15569), '#non-significant genes': np.int64(12952), '#silent genes': np.int64(0), 'ell': np.int64(32), 'Elapsed time': '0h 0m 1s 944ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(15118), '#non-significant genes': np.int64(14736), '#silent genes': np.int64(0), 'ell': np.int64(66), 'Elapsed time': '0h 0m 12s 065ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(14588), '#non-significant genes': np.int64(14096), '#silent genes': np.int64(0), 'ell': np.int64(84), 'Elapsed time': '0h 0m 9s 816ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(14746), '#non-significant genes': np.int64(11152), '#silent genes': np.int64(0), 'ell': np.int64(119), 'Elapsed time': '0h 0m 10s 696ms', 'solver': 'full'}
start RECODE for scRNA-seq data
end RECODE for scRNA-seq
log: {'seq_target': 'RNA', '#significant genes': np.int64(12010), '#non-significant genes': np.int64(9099), '#silent genes': np.int64(0), 'ell': np.int64(43), 'Elapsed time': '0h 0m 2s 218ms', 'solver': 'full'}
[19]:
# Same SpeciesOT steps as the step-by-step cells above, written as one chain.
# NOTE(review): `spe_ot2` is bound to whatever plot_transcriptomic_discrepancy()
# returns. The earlier standalone call shows no returned value in its recorded
# output, so this may be None rather than a SpeciesOT object — confirm before
# reusing `spe_ot2` in later cells.
spe_ot2 = (
    SpeciesOT(data2)
    .preprocessing()
    .calculate_gene_distance_matrix()
    .gromov_wasserstein_ot()
    .normalize_otp()
    .plot_transcriptomic_discrepancy()
)
epsilon = 0.1000000 converged
_images/tutorial2_34_1.png
_images/tutorial2_34_2.png

End of tutorial