## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE ) ## ----------------------------------------------------------------------------- # library(mLLMCelltype) # # results <- annotate_cell_types( # input, # Marker gene data (data frame, list, or file path) # tissue_name, # Tissue name (e.g., "human PBMC", "mouse brain") # model, # LLM model to use # api_key = NA, # API key (if not set in environment, NA returns prompt only) # top_gene_count = 10, # Number of top genes per cluster to use # debug = FALSE # Whether to print debugging information # ) ## ----eval=FALSE--------------------------------------------------------------- # consensus_results <- interactive_consensus_annotation( # input, # Original marker gene data (Seurat FindAllMarkers result or list of genes) # tissue_name = NULL, # Optional tissue name # models = c("claude-3-7-sonnet-20250219", "gpt-4o", "gemini-1.5-pro"), # Models to use # api_keys, # Named list of API keys # top_gene_count = 10, # Number of top genes to use # controversy_threshold = 0.7, # Threshold for identifying controversial clusters # entropy_threshold = 1.0, # Entropy threshold for controversial clusters # max_discussion_rounds = 3, # Maximum discussion rounds # consensus_check_model = NULL, # Model to use for consensus checking (see recommendations below) # log_dir = file.path(tempdir(), "mLLMCelltype_logs"), # Directory for logs (using tempdir) # cache_dir = file.path(tempdir(), "mLLMCelltype_cache"), # Directory for cache (using tempdir) # use_cache = TRUE # Whether to use cache # ) ## ----------------------------------------------------------------------------- # # Load example data # library(Seurat) # data("pbmc_small") # # # Find markers # pbmc_markers <- FindAllMarkers(pbmc_small, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) # # # Run annotation with a single model # results <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = "claude-3-7-sonnet-20250219", # api_key = Sys.getenv("ANTHROPIC_API_KEY"), # top_gene_count = 10 # ) # # # Add annotations to Seurat object # pbmc_small$cell_type_claude <- plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = as.character(0:(length(results)-1)), # to = results # ) # # # Visualize # DimPlot(pbmc_small, group.by = "cell_type_claude", label = TRUE) ## ----------------------------------------------------------------------------- # # Define multiple models to use # models <- c( # "claude-3-7-sonnet-20250219", # Anthropic # "gpt-4o", # OpenAI # "gemini-1.5-pro", # Google # "grok-3" # X.AI # ) # # # API keys for different providers # api_keys <- list( # anthropic = Sys.getenv("ANTHROPIC_API_KEY"), # openai = Sys.getenv("OPENAI_API_KEY"), # gemini = Sys.getenv("GEMINI_API_KEY"), # grok = Sys.getenv("GROK_API_KEY") # ) # # # Run annotation with multiple models # results <- list() # for (model in models) { # provider <- get_provider(model) # api_key <- api_keys[[provider]] # # results[[model]] <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = model, # api_key = api_key, # top_gene_count = 10 # ) # } # # # Create consensus # consensus_results <- interactive_consensus_annotation( # input = pbmc_markers, # tissue_name = "human PBMC", # models = models, # Use all the models defined above # api_keys = api_keys, # controversy_threshold = 0.7, # entropy_threshold = 1.0, # consensus_check_model = "claude-3-7-sonnet-20250219" # ) # # # View consensus results # # You can access the final annotations with consensus_results$final_annotations # # # Add consensus annotations and metrics to Seurat object # pbmc_small$cell_type_consensus <- plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = as.character(0:(length(consensus_results$final_annotations)-1)), # to = consensus_results$final_annotations # ) # # # Extract consensus metrics from the consensus results # consensus_metrics <- lapply(names(consensus_results$initial_results$consensus_results), function(cluster_id) { # metrics <- consensus_results$initial_results$consensus_results[[cluster_id]] # return(list( # cluster = cluster_id, # consensus_proportion = metrics$consensus_proportion, # entropy = metrics$entropy # )) # }) # # # Convert to data frame for easier handling # metrics_df <- do.call(rbind, lapply(consensus_metrics, data.frame)) # # # Add consensus proportion to Seurat object # pbmc_small$consensus_proportion <- plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = metrics_df$cluster, # to = metrics_df$consensus_proportion # ) # # # Add entropy to Seurat object # pbmc_small$shannon_entropy <- plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = metrics_df$cluster, # to = metrics_df$entropy # ) ## ----------------------------------------------------------------------------- # # Set OpenRouter API key # openrouter_api_key <- Sys.getenv("OPENROUTER_API_KEY") # # # Define free OpenRouter models to use # free_models <- c( # "meta-llama/llama-4-maverick:free", # Meta Llama 4 Maverick (free) # "nvidia/llama-3.1-nemotron-ultra-253b-v1:free", # NVIDIA Nemotron Ultra 253B (free) # "deepseek/deepseek-chat-v3-0324:free", # DeepSeek Chat v3 (free) # "microsoft/mai-ds-r1:free" # Microsoft MAI-DS-R1 (free) # ) # # # Run annotation with free OpenRouter models # free_results <- list() # for (model in free_models) { # free_results[[model]] <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = model, # OpenRouter models are automatically detected by format: 'provider/model-name:free' # api_key = openrouter_api_key, # top_gene_count = 10 # ) # } # # # Create consensus with free models # free_consensus_results <- interactive_consensus_annotation( # input = pbmc_markers, # tissue_name = "human PBMC", # models = free_models, # Use all the free models defined above # api_keys = list("openrouter" = openrouter_api_key), # controversy_threshold = 0.7, # entropy_threshold = 1.0, # consensus_check_model = "meta-llama/llama-4-maverick:free" # Use a free model for consensus checking # ) # # # View free model consensus results # # You can access the final annotations with free_consensus_results$final_annotations # # # Add free model consensus annotations to Seurat object # pbmc_small$free_model_consensus <- plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = as.character(0:(length(free_consensus_results$final_annotations)-1)), # to = free_consensus_results$final_annotations # ) # # # Compare paid vs. free model results # comparison <- data.frame( # cluster = as.character(0:(length(consensus_results$final_annotations)-1)), # paid_models = consensus_results$final_annotations, # free_models = free_consensus_results$final_annotations, # agreement = consensus_results$final_annotations == free_consensus_results$final_annotations # ) # print(comparison) ## ----------------------------------------------------------------------------- # # Save markers to CSV # write.csv(pbmc_markers, "pbmc_markers.csv", row.names = FALSE) # # # Run annotation using the CSV file # results <- annotate_cell_types( # input = "pbmc_markers.csv", # tissue_name = "human PBMC", # model = "claude-3-7-sonnet-20250219", # api_key = Sys.getenv("ANTHROPIC_API_KEY") # ) ## ----------------------------------------------------------------------------- # # Note: The annotate_cell_types function does not have built-in caching. # # If you need caching, you can implement it separately. # # # Run annotation # results <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = "claude-3-7-sonnet-20250219", # api_key = Sys.getenv("ANTHROPIC_API_KEY"), # top_gene_count = 10, # debug = FALSE # ) # # # If you need custom caching, you can implement it using your own cache manager # # This is just a conceptual example and not part of the actual package # # cache_manager <- YourCacheManager$new(cache_dir = file.path(tempdir(), "cache")) # # cache_manager$clear_cache() ## ----------------------------------------------------------------------------- # # Example of using a free model via OpenRouter # # First, set your OpenRouter API key # Sys.setenv(OPENROUTER_API_KEY = "your-openrouter-api-key") # # # Then use a free model with the :free suffix # free_model_results <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = "meta-llama/llama-4-maverick:free", # Note the :free suffix # api_key = Sys.getenv("OPENROUTER_API_KEY") # # No need to specify provider - it's automatically detected from the model name format # ) ## ----------------------------------------------------------------------------- # library(Seurat) # library(mLLMCelltype) # library(ggplot2) # # # Load data # data("pbmc_small") # # # Standard Seurat preprocessing # pbmc_small <- NormalizeData(pbmc_small) # pbmc_small <- FindVariableFeatures(pbmc_small) # pbmc_small <- ScaleData(pbmc_small) # pbmc_small <- RunPCA(pbmc_small) # pbmc_small <- FindNeighbors(pbmc_small) # pbmc_small <- FindClusters(pbmc_small, resolution = 0.5) # pbmc_small <- RunUMAP(pbmc_small, dims = 1:10) # # # Find markers for each cluster # pbmc_markers <- FindAllMarkers(pbmc_small, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) # # # Define models to use # models <- c( # "claude-3-7-sonnet-20250219", # "gpt-4o", # "gemini-1.5-pro" # ) # # # API keys # api_keys <- list( # anthropic = Sys.getenv("ANTHROPIC_API_KEY"), # openai = Sys.getenv("OPENAI_API_KEY"), # gemini = Sys.getenv("GEMINI_API_KEY") # ) # # # Run annotation with multiple models # results <- list() # for (model in models) { # provider <- get_provider(model) # api_key <- api_keys[[provider]] # # results[[model]] <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = model, # api_key = api_key, # top_gene_count = 10 # ) # # # Add individual model results to Seurat object # column_name <- paste0("cell_type_", gsub("[^a-zA-Z0-9]", "_", model)) # pbmc_small[[column_name]] <- plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = as.character(0:(length(results[[model]])-1)), # to = results[[model]] # ) # } # # # Create consensus # consensus_results <- interactive_consensus_annotation( # input = pbmc_markers, # tissue_name = "human PBMC", # models = models, # Use all the models defined above # api_keys = api_keys, # controversy_threshold = 0.7, # entropy_threshold = 1.0, # consensus_check_model = "claude-3-7-sonnet-20250219" # ) # # # Add consensus results to Seurat object # pbmc_small$cell_type_consensus <- plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = as.character(0:(length(consensus_results$final_annotations)-1)), # to = consensus_results$final_annotations # ) # # # Extract consensus metrics from the consensus results # consensus_metrics <- lapply(names(consensus_results$initial_results$consensus_results), function(cluster_id) { # metrics <- consensus_results$initial_results$consensus_results[[cluster_id]] # return(list( # cluster = cluster_id, # consensus_proportion = metrics$consensus_proportion, # entropy = metrics$entropy # )) # }) # # # Convert to data frame for easier handling # metrics_df <- do.call(rbind, lapply(consensus_metrics, data.frame)) # # # Add consensus proportion to Seurat object # pbmc_small$consensus_proportion <- as.numeric(plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = metrics_df$cluster, # to = metrics_df$consensus_proportion # )) # # # Add entropy to Seurat object # pbmc_small$shannon_entropy <- as.numeric(plyr::mapvalues( # x = as.character(Idents(pbmc_small)), # from = metrics_df$cluster, # to = metrics_df$entropy # )) # # # Visualize results # p1 <- DimPlot(pbmc_small, group.by = "cell_type_consensus", label = TRUE, repel = TRUE) + # ggtitle("Cell Type Annotations") + # theme(plot.title = element_text(hjust = 0.5)) # # p2 <- FeaturePlot(pbmc_small, features = "consensus_proportion", cols = c("yellow", "green", "blue")) + # ggtitle("Consensus Proportion") + # theme(plot.title = element_text(hjust = 0.5)) # # p3 <- FeaturePlot(pbmc_small, features = "shannon_entropy", cols = c("red", "orange")) + # ggtitle("Shannon Entropy") + # theme(plot.title = element_text(hjust = 0.5)) # # # Combine plots # p1 | p2 | p3 ## ----------------------------------------------------------------------------- # # Using more genes (better for well-characterized tissues) # results_more_genes <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = "claude-3-7-sonnet-20250219", # api_key = Sys.getenv("ANTHROPIC_API_KEY"), # top_gene_count = 20 # Using more genes # ) # # # Using fewer genes (better for noisy data) # results_fewer_genes <- annotate_cell_types( # input = pbmc_markers, # tissue_name = "human PBMC", # model = "claude-3-7-sonnet-20250219", # api_key = Sys.getenv("ANTHROPIC_API_KEY"), # top_gene_count = 5 # Using fewer genes # ) ## ----eval=FALSE--------------------------------------------------------------- # # Example of using interactive_consensus_annotation with different controversy thresholds # # Lower threshold (more clusters will be discussed) # consensus_results_low_threshold <- interactive_consensus_annotation( # input = pbmc_markers, # tissue_name = "human PBMC", # models = c("claude-3-7-sonnet-20250219", "gpt-4o", "gemini-2.0-flash"), # api_keys = list( # "anthropic" = Sys.getenv("ANTHROPIC_API_KEY"), # "openai" = Sys.getenv("OPENAI_API_KEY"), # "gemini" = Sys.getenv("GEMINI_API_KEY") # ), # controversy_threshold = 0.3 # Lower threshold - more clusters will be discussed # ) # # # Higher threshold (fewer clusters will be discussed) # consensus_results_high_threshold <- interactive_consensus_annotation( # input = pbmc_markers, # tissue_name = "human PBMC", # models = c("claude-3-7-sonnet-20250219", "gpt-4o", "gemini-2.0-flash"), # api_keys = list( # "anthropic" = Sys.getenv("ANTHROPIC_API_KEY"), # "openai" = Sys.getenv("OPENAI_API_KEY"), # "gemini" = Sys.getenv("GEMINI_API_KEY") # ), # controversy_threshold = 0.7 # Higher threshold - fewer clusters will be discussed # )