你好,现在我需要发一个关于玉米gs 的sci文章,主要逻辑是比较原始表型数据和通过系谱矫正和空间矫正...
Tạo vào: 3 tháng 2, 2026
Trả lời bằng GPT-5.2 Thinking bởi Chat01
Tạo vào: 3 tháng 2, 2026
Trả lời bằng GPT-5.2 Thinking bởi Chat01
你好,现在我需要发一个关于玉米gs 的sci文章,主要逻辑是比较原始表型数据和通过系谱矫正和空间矫正和系谱加空间矫正后的精度比较。请你给我一个完整的思路流程。具体数据分析要怎么做。(注意双重收缩、数据泄漏等问题)
上传的文件为我的表型数据。据此给出一个完整的生产及R脚本。脚本要足够健壮且快速。
下面给你一套可以直接写成 SCI 的完整逻辑 + 可落地的数据分析流程(重点处理:** 双重收缩**、数据泄漏、速度/健壮性),并附上基于你上传的表型文件(7×124 的行列坐标、470 个杂交组合、少量重复)可直接改路径就能跑的 R 生产脚本。
玉米杂交组合的田间表型受:
对训练表型进行:
你要比较的不是“随便修正一下”,而是同一套 CV 框架下的四种表型进入 GS 的方式:
你文中可以强调:RAW 的表型噪声最大;S 去掉空间噪声;P 借助家系结构提高遗传值估计;P+S 同时减少空间噪声并利用相关性。
错误做法:先用全体数据做空间/系谱校正得到一个“校正后表型”,再做 K 折 CV。
这会把测试集信息(空间趋势参数、系谱随机效应估计等)带入训练过程 → 精度虚高。
正确做法(你要用这个):
脚本里我已经按这个原则实现。
当你用 BLUP/EBV(已经 shrink) 当作 GS 的表型,再用 rrBLUP/GBLUP 做一次 shrink,会导致:
解决方案(二选一):
脚本里对 P / P+S 都默认用 deregress + weight(若提取不到 PEV 会自动降级并提示)。
你上传的表型表格特征:
说明:你只上传了表型文件,GS 还需要基因型矩阵(marker 0/1/2)或 G 矩阵。
脚本支持两种输入:
geno_file:csv/tsv,第一列 id,其余为 markers(0/1/2,允许缺失)- 或你自己提前算好的
G.rds
把路径改掉就能跑。
r## ========================================================= ## Maize GS: RAW vs Pedigree (P) vs Spatial (S) vs P+S ## - Leak-free CV (correction inside each fold) ## - Handle double shrinkage via deregression for A-BLUP ## - Fast data.table + parallel CV ## ========================================================= ## --------------------------- ## 0) Config ## --------------------------- pheno_file <- "2x2use_副本.xlsx" # 你的表型文件 pheno_sheet <- 1 ## 任选其一:基因型矩阵或G矩阵(二者都给则优先geno->现场算G/rrBLUP) geno_file <- NULL # e.g. "geno_matrix.csv" (id + markers) geno_sep <- "," # csv="," tsv="\t" G_file <- NULL # e.g. "G_matrix.rds" (rownames=ids) traits <- c("yield_ha") # 想分析哪些性状;下面会重命名 k_folds <- 5 n_repeats <- 20 seed <- 20260203 n_cores <- max(1, parallel::detectCores() - 1) ## 空间模型优先用 SpATS(快、稳),P+S 用“两步近似”(先去空间再A-BLUP) use_SpATS <- TRUE ## 输出目录 out_dir <- "GS_outputs" dir.create(out_dir, showWarnings = FALSE, recursive = TRUE) ## --------------------------- ## 1) Packages (auto-check) ## --------------------------- safe_require <- function(pkg){ if (!requireNamespace(pkg, quietly = TRUE)) { stop(sprintf("Package '%s' not installed. Please install it first.", pkg)) } } safe_require("data.table") safe_require("readxl") safe_require("Matrix") safe_require("rrBLUP") safe_require("AGHmatrix") safe_require("future") safe_require("future.apply") if (use_SpATS) safe_require("SpATS") safe_require("stats") library(data.table) ## 并行 future::plan(future::multisession, workers = n_cores) ## --------------------------- ## 2) Read & clean phenotype ## --------------------------- ph0 <- readxl::read_excel(pheno_file, sheet = pheno_sheet) ph <- as.data.table(ph0) ## 统一列名(避免中文/编码问题) ## 你文件里是:No id 母本 父本 排 列 株高 穗位 抽丝期 粒重 水份 收获面积 标准粒重14水分 公顷产量 setnames(ph, old = c("No","id","母本","父本","排","列","株高","穗位","抽丝期","粒重","水份","收获面积","标准粒重14水分","公顷产量"), new = c("plot_no","hyb","dam","sire","row","col","plant_h","ear_h","silk","grain_wt","moist","area","grain_wt14","yield_ha"), skip_absent = TRUE) ## 基础类型 ph[, `:=`( hyb = as.character(hyb), dam = as.character(dam), sire = as.character(sire), row = as.integer(row), col = as.integer(col) )] stopifnot(all(c("hyb","dam","sire","row","col") %in% names(ph))) ## --------------------------- ## 3) Build pedigree A matrix from (hyb <- dam + sire) ## --------------------------- build_A_from_cross <- function(dt){ ped_hyb <- unique(dt[, .(id = hyb, dam = dam, sire = sire)]) base_par <- unique(c(ped_hyb$dam, ped_hyb$sire)) ped_par <- data.table(id = base_par, dam = NA_character_, sire = NA_character_) ped_all <- unique(rbind(ped_par, ped_hyb), by = "id") ## order pedigree (AGHmatrix needs ordered pedigree sometimes) ped_all <- as.data.frame(ped_all) ped_all <- AGHmatrix::orderPed(ped_all) A <- AGHmatrix::Amatrix(ped_all, ploidy = 2) ## 数值稳健(避免非正定) A <- as.matrix(Matrix::nearPD(A, corr = FALSE)$mat) rownames(A) <- colnames(A) <- rownames(A) # keep list(A = A, ped = ped_all) } Aobj <- build_A_from_cross(ph) A <- Aobj$A ## 只保留有表型的hyb hyb_ids <- sort(unique(ph$hyb)) A_hyb <- A[hyb_ids, hyb_ids, drop = FALSE] ## --------------------------- ## 4) Load genotype / G (optional but needed for GS) ## --------------------------- read_geno_matrix <- function(file, sep = ","){ g <- fread(file, sep = sep, data.table = FALSE) stopifnot(ncol(g) > 2) ids <- as.character(g[[1]]) M <- as.matrix(g[, -1, drop = FALSE]) storage.mode(M) <- "numeric" rownames(M) <- ids M } impute_center <- function(M){ ## 均值填补 + 中心化(按列) cm <- colMeans(M, na.rm = TRUE) idx_na <- which(is.na(M), arr.ind = TRUE) if (nrow(idx_na) > 0) M[idx_na] <- cm[idx_na[,2]] M <- scale(M, center = TRUE, scale = FALSE) M } ## 返回:list(M, ids_common) load_predictor <- function(ph_ids, geno_file=NULL, geno_sep=",", G_file=NULL){ if (!is.null(geno_file)) { M <- read_geno_matrix(geno_file, geno_sep) ids_common <- intersect(ph_ids, rownames(M)) if (length(ids_common) < 30) stop("Too few common IDs between phenotype and genotype matrix.") M <- impute_center(M[ids_common, , drop = FALSE]) return(list(type="M", M=M, ids=ids_common)) } if (!is.null(G_file)) { G <- readRDS(G_file) ids_common <- intersect(ph_ids, rownames(G)) if (length(ids_common) < 30) stop("Too few common IDs between phenotype and G matrix.") G <- as.matrix(G[ids_common, ids_common, drop = FALSE]) G <- as.matrix(Matrix::nearPD(G, corr = FALSE)$mat) return(list(type="G", G=G, ids=ids_common)) } stop("For GS you must provide either geno_file (marker matrix) or G_file (relationship matrix).") } pred_obj <- load_predictor(hyb_ids, geno_file, geno_sep, G_file) ## --------------------------- ## 5) Spatial correction (S) using SpATS (fast & stable) ## Return adjusted genotype means (BLUE-like) WITHOUT leakage: ## - fit only on training plots ## --------------------------- fit_spats_get_means <- function(dt_train, trait){ ## SpATS需要row/col为数值,且建议排序 d <- copy(dt_train) d <- d[!is.na(get(trait))] d[, `:=`(row = as.numeric(row), col = as.numeric(col))] ## PSANOVA: 2D P-spline spatial surface m <- SpATS::SpATS( response = trait, spatial = ~ SpATS::PSANOVA(row, col, nseg = c(7, 30), degree = c(3,3), pord = c(2,2)), genotype = "hyb", genotype.as.random = FALSE, # 关键:输出更像BLUE,减少双重收缩风险 fixed = ~ 1, random = ~ 1, data = as.data.frame(d) ) ## 提取hyb的估计值(固定效应部分) ## SpATS对象中:m$coefficients$fixed 里含各水平系数(版本差异较大,做健壮处理) cf <- m$coefficients$fixed nms <- names(cf) ## 截距 b0 <- unname(cf[which(nms %in% c("(Intercept)","Intercept"))][1]) if (is.na(b0)) b0 <- cf[1] ## hyb系数名通常像 "hybXXX" 或 "hybXXX" ## 做一个稳健匹配:包含"hyb"且后面跟水平名 hyb_levels <- unique(d$hyb) est <- setNames(rep(NA_real_, length(hyb_levels)), hyb_levels) for (h in hyb_levels){ hit <- which(grepl(paste0("^hyb", h, "$"), nms) | grepl(paste0("hyb", h), nms)) if (length(hit) == 0) { ## 有的版本用 "hyb:h" 或 "hybH" hit <- which(grepl(h, nms) & grepl("hyb", nms)) } est[h] <- b0 + ifelse(length(hit) > 0, unname(cf[hit[1]]), 0) } data.table(hyb = names(est), y = as.numeric(est), w = 1.0) } ## --------------------------- ## 6) Pedigree correction (P) via A-BLUP on training only ## - We use rrBLUP::kin.blup with K=A (fast) ## - Then deregress to avoid double shrinkage ## --------------------------- fit_Ablup_deregress <- function(dt_train, trait, A_train){ d <- copy(dt_train)[!is.na(get(trait))] ## 组合均值作为观测(因为多数只有一次观测,这一步不会损失信息) ybar <- d[, .(y = mean(get(trait))), by = hyb] ids <- ybar$hyb K <- A_train[ids, ids, drop = FALSE] ## rrBLUP::kin.blup 需要 data.frame kb <- rrBLUP::kin.blup( data = as.data.frame(ybar), geno = "hyb", pheno = "y", K = K, GAUSS = FALSE ) ## 提取EBV与PEV (rrBLUP版本不同字段可能不同,做健壮处理) ebv <- kb$g if (is.null(ebv)) ebv <- kb$BLUP if (is.null(ebv)) stop("Cannot find EBV/BLUP in kin.blup output.") ebv <- ebv[ids] pev <- kb$PEV if (is.null(pev)) { ## 如果没有PEV,则无法严格去回归:降级为不去回归(仍可跑,但要在文中说明) warning("PEV not found in kin.blup; deregression skipped (may cause double-shrink).") return(data.table(hyb = ids, y = as.numeric(ebv), w = 1.0)) } pev <- pev[ids] ## 遗传方差(从输出里找;找不到就用样本方差近似) sigA2 <- kb$Vg if (is.null(sigA2) || is.na(sigA2)) sigA2 <- stats::var(as.numeric(ebv), na.rm = TRUE) r <- 1 - (pev / sigA2) r <- pmin(pmax(r, 1e-6), 0.999) # 防数值问题 y_drp <- as.numeric(ebv) / r w <- r / (1 - r) data.table(hyb = ids, y = y_drp, w = w) } ## --------------------------- ## 7) P+S: two-step approximation (leak-free) ## Step1: remove spatial trend (SpATS) on training plots ## Step2: A-BLUP on spatial-adjusted phenotype ## --------------------------- fit_PS_two_step <- function(dt_train, trait, A_train){ ## 先用SpATS拟合空间面(genotype不进模型或作为固定可都行) d <- copy(dt_train)[!is.na(get(trait))] d[, `:=`(row = as.numeric(row), col = as.numeric(col))] m <- SpATS::SpATS( response = trait, spatial = ~ SpATS::PSANOVA(row, col, nseg = c(7, 30), degree = c(3,3), pord = c(2,2)), genotype = NULL, # 只建空间面,避免把遗传信号吸走(更稳) fixed = ~ 1, random = ~ 1, data = as.data.frame(d) ) ## 取空间拟合值:fitted包含固定+空间 yhat <- as.numeric(stats::fitted(m)) y_adj <- d[[trait]] - yhat # 去空间后的残差表型(保留遗传差异+噪声) d_adj <- d[, .(hyb, y_adj)] ybar <- d_adj[, .(y = mean(y_adj)), by = hyb] ids <- ybar$hyb K <- A_train[ids, ids, drop = FALSE] kb <- rrBLUP::kin.blup( data = as.data.frame(ybar), geno = "hyb", pheno = "y", K = K, GAUSS = FALSE ) ebv <- kb$g if (is.null(ebv)) ebv <- kb$BLUP ebv <- ebv[ids] pev <- kb$PEV if (is.null(pev)) { warning("PEV not found in kin.blup (P+S); deregression skipped.") return(data.table(hyb = ids, y = as.numeric(ebv), w = 1.0)) } pev <- pev[ids] sigA2 <- kb$Vg if (is.null(sigA2) || is.na(sigA2)) sigA2 <- stats::var(as.numeric(ebv), na.rm = TRUE) r <- 1 - (pev / sigA2) r <- pmin(pmax(r, 1e-6), 0.999) y_drp <- as.numeric(ebv) / r w <- r / (1 - r) data.table(hyb = ids, y = y_drp, w = w) } ## --------------------------- ## 8) RAW: simple mean per hybrid (leak-free) ## --------------------------- fit_RAW_mean <- function(dt_train, trait){ d <- copy(dt_train)[!is.na(get(trait))] ybar <- d[, .(y = mean(get(trait))), by = hyb] ybar[, w := 1.0] ybar } ## --------------------------- ## 9) Genomic prediction (rrBLUP ridge) using marker matrix ## If only G is provided, use GBLUP via mixed.solve on K ## --------------------------- predict_rrblup <- function(train_tbl, test_ids, pred_obj){ ## align train_tbl <- train_tbl[hyb %in% pred_obj$ids] test_ids <- intersect(test_ids, pred_obj$ids) if (length(test_ids) < 5 || nrow(train_tbl) < 20) return(NULL) if (pred_obj$type == "M"){ M <- pred_obj$M Mtr <- M[train_tbl$hyb, , drop = FALSE] Mte <- M[test_ids, , drop = FALSE] ms <- rrBLUP::mixed.solve( y = train_tbl$y, Z = Mtr, weights = train_tbl$w, SE = FALSE ) u <- ms$u b <- as.numeric(ms$beta) pred <- as.numeric(Mte %*% u + b) return(data.table(hyb = test_ids, pred = pred)) } if (pred_obj$type == "G"){ ## GBLUP: y = 1b + Zu, u~N(0, G*sigma) ## rrBLUP::mixed.solve 支持K(u的协方差) G <- pred_obj$G ids_tr <- train_tbl$hyb ids_te <- test_ids Ktr <- G[ids_tr, ids_tr, drop = FALSE] Kte <- G[ids_te, ids_tr, drop = FALSE] ms <- rrBLUP::mixed.solve( y = train_tbl$y, K = Ktr, weights = train_tbl$w, SE = FALSE ) ## 训练集u_hat在ms$u;预测:u_te = Kte %*% Ktr^{-1} %*% u_tr (rrBLUP内部等价) ## mixed.solve 没直接给Kinv,这里用求解线性方程(小心数值) Ktr_pd <- as.matrix(Matrix::nearPD(Ktr, corr = FALSE)$mat) u_tr <- as.numeric(ms$u) u_te <- as.numeric(Kte %*% solve(Ktr_pd, u_tr)) b <- as.numeric(ms$beta) pred <- u_te + b return(data.table(hyb = ids_te, pred = pred)) } NULL } ## --------------------------- ## 10) CV runner (leak-free) ## --------------------------- make_folds <- function(ids, k, seed){ set.seed(seed) ids <- sample(ids) fold <- rep(1:k, length.out = length(ids)) data.table(hyb = ids, fold = fold) } eval_accuracy <- function(pred_dt, y_obs_dt){ m <- merge(pred_dt, y_obs_dt, by = "hyb", all = FALSE) if (nrow(m) < 5) return(list(r=NA_real_, rs=NA_real_)) r <- suppressWarnings(stats::cor(m$pred, m$y, method = "pearson")) rs <- suppressWarnings(stats::cor(m$pred, m$y, method = "spearman")) list(r=r, rs=rs) } run_one_method <- function(method, dt_train, trait, A_train){ if (method == "RAW") return(fit_RAW_mean(dt_train, trait)) if (method == "S") return(fit_spats_get_means(dt_train, trait)) if (method == "P") return(fit_Ablup_deregress(dt_train, trait, A_train)) if (method == "PS") return(fit_PS_two_step(dt_train, trait, A_train)) stop("Unknown method: ", method) } run_cv_trait <- function(ph, trait, methods = c("RAW","P","S","PS")){ ids_all <- sort(unique(ph$hyb)) ## 若有基因型/G,仅保留共同个体 ids_all <- intersect(ids_all, pred_obj$ids) res_list <- future.apply::future_lapply(1:n_repeats, function(rep_i){ fold_dt <- make_folds(ids_all, k_folds, seed + rep_i) rep_res <- rbindlist(lapply(1:k_folds, function(k){ test_ids <- fold_dt[fold == k, hyb] train_ids <- fold_dt[fold != k, hyb] dt_train <- ph[hyb %in% train_ids] dt_test <- ph[hyb %in% test_ids] ## 观测真值:测试集内部求均值(不参与训练,不算泄漏) y_obs <- dt_test[, .(y = mean(get(trait))), by = hyb] out_k <- rbindlist(lapply(methods, function(mtd){ ## 训练表型(校正只在训练集上完成) train_tbl <- tryCatch( run_one_method(mtd, dt_train, trait, A_hyb), error = function(e){ warning(sprintf("[Rep %d Fold %d %s] stage1 failed: %s", rep_i, k, mtd, e$message)) return(NULL) } ) if (is.null(train_tbl) || nrow(train_tbl) < 20) { return(data.table(method = mtd, rep = rep_i, fold = k, r = NA_real_, rs = NA_real_)) } ## 基因组预测 pred_dt <- tryCatch( predict_rrblup(train_tbl, test_ids, pred_obj), error = function(e){ warning(sprintf("[Rep %d Fold %d %s] GS failed: %s", rep_i, k, mtd, e$message)) return(NULL) } ) if (is.null(pred_dt)) { return(data.table(method = mtd, rep = rep_i, fold = k, r = NA_real_, rs = NA_real_)) } acc <- eval_accuracy(pred_dt, y_obs) data.table(method = mtd, rep = rep_i, fold = k, r = acc$r, rs = acc$rs) }), fill = TRUE) out_k }), fill = TRUE) rep_res }) res <- rbindlist(res_list, fill = TRUE) ## 汇总 sum1 <- res[, .( mean_r = mean(r, na.rm = TRUE), sd_r = sd(r, na.rm = TRUE), mean_rs = mean(rs, na.rm = TRUE), sd_rs = sd(rs, na.rm = TRUE), n_ok = sum(!is.na(r)) ), by = method][order(-mean_r)] list(detail = res, summary = sum1) } ## --------------------------- ## 11) Run & save ## --------------------------- all_outputs <- list() for (tr in traits){ message("Running trait: ", tr) out <- run_cv_trait(ph, tr, methods = c("RAW","P","S","PS")) all_outputs[[tr]] <- out fwrite(out$detail, file = file.path(out_dir, paste0("CV_detail_", tr, ".csv"))) fwrite(out$summary, file = file.path(out_dir, paste0("CV_summary_", tr, ".csv"))) saveRDS(out, file = file.path(out_dir, paste0("CV_", tr, ".rds"))) } message("Done. Results saved to: ", out_dir)
geno_file(marker 矩阵)或 G_file(关系矩阵 rds)yield_ha, plant_h, silk 等)如果你把基因型文件格式(列名/编码/缺失)也上传,我可以把脚本里的“读入+质控+快速计算G/MAF过滤/并行”部分进一步定制到你真实数据规模(几十万 SNP 也能跑得动),并把“P+S 联合一步模型(非近似)”也给你补上对应实现路径(sommer/asreml 版本各一套)。