# 线程数 MM2_THREADS=36 ST_THREADS=36 # 软件绝对路径(minim...
创建于:2025年12月9日
使用 GPT-5.1 Thinking 回答 Chat01
创建于:2025年12月9日
使用 GPT-5.1 Thinking 回答 Chat01
MM2_THREADS=36
ST_THREADS=36
SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools"
SEQKIT="/public/Zhaijx/wangz/Condaenvir/envs/seqkit/bin/seqkit"
REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta"
IN_FASTQ="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/fastq/Col-cotyledon-dor-pass-fail-sup.fastq"
OUTDIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2"
mkdir -p "${OUTDIR}"
SAMPLE="Col-coty-dorado"
BASE="{SAMPLE}-Col-CC-V2"
###############################################################################
###############################################################################
R1_PREFIX="{R1_PREFIX}.sam"
R1_BAM="{R1_PREFIX}-sorted.bam"
R1_UNMAPPED_BAM="{R1_PREFIX}-sorted.unmapped_read_ids.txt"
R1_UNMAPPED_IDS_CLEAN="{R1_PREFIX}.unmapped.fq.gz"
echo "[R1] minimap2 map-ont: {MM2_THREADS}
"{IN_FASTQ}" > "${R1_SAM}"
echo "[R1] SAM -> BAM & sort"
"{ST_THREADS} -b -o "{R1_SAM}"
"{ST_THREADS} -o "{R1_BAM}"
"{ST_THREADS} "${R1_SORTED_BAM}"
echo "[R1] 提取未比对 reads 的 ID (flag 4)"
"{ST_THREADS} -b -f 4 "{R1_UNMAPPED_BAM}"
"{ST_THREADS} "{R1_UNMAPPED_BAM}" | awk '{print 1}' > "${R1_UNMAPPED_IDS}"
tr -d '\r' < "0)>0' | LC_ALL=C sort -u > "${R1_UNMAPPED_IDS_CLEAN}"
echo "[R1] 未比对 ID 行数:{R1_UNMAPPED_IDS_CLEAN}")"
echo "[R1] 使用 seqkit 从原始 FASTQ 抽取未比对 reads"
"{ST_THREADS} -f "{IN_FASTQ}" -o "${R1_UNMAPPED_FASTQ}"
echo "[R1] R1_unmapped FASTQ 统计:"
"{R1_UNMAPPED_FASTQ}"
echo "[R1] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam"
rm -f "{R1_BAM}"
###############################################################################
###############################################################################
R2_PREFIX="{R2_PREFIX}.sam"
R2_BAM="{R2_PREFIX}-sorted.bam"
R2_UNMAPPED_BAM="{R2_PREFIX}-sorted.unmapped_read_ids.txt"
R2_UNMAPPED_IDS_CLEAN="{R2_PREFIX}.unmapped.fq.gz"
echo "[R2] minimap2 asm20: {MM2_THREADS}
"{R1_UNMAPPED_FASTQ}" > "${R2_SAM}"
echo "[R2] SAM -> BAM & sort"
"{ST_THREADS} -b -o "{R2_SAM}"
"{ST_THREADS} -o "{R2_BAM}"
"{ST_THREADS} "${R2_SORTED_BAM}"
echo "[R2] 提取未比对 reads 的 ID (flag 4)"
"{ST_THREADS} -b -f 4 "{R2_UNMAPPED_BAM}"
"{ST_THREADS} "{R2_UNMAPPED_BAM}" | awk '{print 1}' > "${R2_UNMAPPED_IDS}"
tr -d '\r' < "0)>0' | LC_ALL=C sort -u > "${R2_UNMAPPED_IDS_CLEAN}"
echo "[R2] 未比对 ID 行数:{R2_UNMAPPED_IDS_CLEAN}")"
echo "[R2] 使用 seqkit 从 R1_unmapped FASTQ 抽取 R2 未比对 reads"
"{ST_THREADS} -f "{R1_UNMAPPED_FASTQ}" -o "${R2_UNMAPPED_FASTQ}"
echo "[R2] R2_unmapped FASTQ 统计:"
"{R2_UNMAPPED_FASTQ}"
echo "[R2] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam"
rm -f "{R2_BAM}"
###############################################################################
###############################################################################
R3_PREFIX="{R3_PREFIX}.sam"
R3_BAM="{R3_PREFIX}-sorted.bam"
R3_UNMAPPED_BAM="{R3_PREFIX}-sorted.unmapped_read_ids.txt"
R3_UNMAPPED_IDS_CLEAN="{R3_PREFIX}.unmapped.fq.gz" # 最终三轮都未比对上的 reads
echo "[R3][planA] minimap2 map-ont with tuned params: {MM2_THREADS}
"{R2_UNMAPPED_FASTQ}" > "${R3_SAM}"
echo "[R3] SAM -> BAM & sort"
"{ST_THREADS} -b -o "{R3_SAM}"
"{ST_THREADS} -o "{R3_BAM}"
"{ST_THREADS} "${R3_SORTED_BAM}"
echo "[R3] 提取未比对 reads 的 ID (flag 4)"
"{ST_THREADS} -b -f 4 "{R3_UNMAPPED_BAM}"
"{ST_THREADS} "{R3_UNMAPPED_BAM}" | awk '{print 1}' > "${R3_UNMAPPED_IDS}"
tr -d '\r' < "0)>0' | LC_ALL=C sort -u > "${R3_UNMAPPED_IDS_CLEAN}"
echo "[R3] 未比对 ID 行数:{R3_UNMAPPED_IDS_CLEAN}")"
echo "[R3] 使用 seqkit 从 R2_unmapped FASTQ 抽取三轮都未比对上的 reads"
"{ST_THREADS} -f "{R2_UNMAPPED_FASTQ}" -o "${R3_UNMAPPED_FASTQ}"
echo "[R3] 最终未比对 FASTQ:{SEQKIT}" stats "${R3_UNMAPPED_FASTQ}"
echo "[R3] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam"
rm -f "{R3_BAM}"
echo "=== 全流程完成(map-ont → asm20 → planA-mapont;三轮均未比对的 reads 已输出)==="
echo "保留文件(每一轮):"
echo " * {R1_UNMAPPED_BAM}"
echo " * {R2_UNMAPPED_BAM}"
echo " * {R3_UNMAPPED_BAM}" 我们现在需要修改这个代码,调用的minimap2为/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2, 其余的软件位置不变 /DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/SV-test-fastq/Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastq \ >/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/Col-coty-sup-Chr1-SV-cs.sam 处理的文件名字以及风格如上,同时我们也处理Col-coty-dorado-minimap2-Col-CC-V2-Chr2-SV.fastq 以及Col-coty-dorado-minimap2-Col-CC-V2-Chr3-SV.fastq 结果名字命名风格类似,同时也经历三轮比对,但比对时候只保留--cs=long 去掉--MD 因为这两个tag冲突,请你根据这要求进行修改吧
我直接按你的要求把脚本改好:
/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2--cs=long,去掉 --MD/DataZhai/.../SV-test-fastq/Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastqminimap2-Col-CC-V2 目录SAMPLE 改成 Col-coty-sup-Chr1-SV,输出命名风格类似你举的例子,仍保留三轮比对对 Chr2、Chr3 只需要改两行:
IN_FASTQ和SAMPLE,下面我也写了示例。
bash#!/usr/bin/env bash set -euo pipefail ############################################################################### # 线程数 ############################################################################### MM2_THREADS=36 ST_THREADS=36 ############################################################################### # 软件绝对路径 ############################################################################### MINIMAP2="/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2" SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools" SEQKIT="/public/Zhaijx/wangz/Condaenvir/envs/seqkit/bin/seqkit" ############################################################################### # 参考基因组(Col-CC-V2) ############################################################################### REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta" ############################################################################### # 原始输入 FASTQ(此处示例为 Chr1 的 SV FASTQ) # Chr1: Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastq # Chr2: Col-coty-dorado-minimap2-Col-CC-V2-Chr2-SV.fastq # Chr3: Col-coty-dorado-minimap2-Col-CC-V2-Chr3-SV.fastq ############################################################################### IN_FASTQ="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/SV-test-fastq/Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastq" ############################################################################### # 输出目录 ############################################################################### OUTDIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2" mkdir -p "${OUTDIR}" ############################################################################### # sample 名,用来拼接输出文件前缀 # 这里示例:Col-coty-sup-Chr1-SV # 如果处理 Chr2/Chr3,建议改为: # Chr2: Col-coty-sup-Chr2-SV # Chr3: Col-coty-sup-Chr3-SV ############################################################################### SAMPLE="Col-coty-sup-Chr1-SV" BASE="${OUTDIR}/${SAMPLE}-Col-CC-V2" ############################################################################### # Round 1:minimap2 map-ont 对原始 FASTQ ############################################################################### R1_PREFIX="${BASE}-r1-mapont" R1_SAM="${R1_PREFIX}.sam" R1_BAM="${R1_PREFIX}.bam" R1_SORTED_BAM="${R1_PREFIX}-sorted.bam" R1_UNMAPPED_BAM="${R1_PREFIX}-sorted-unmapped.bam" R1_UNMAPPED_IDS="${R1_PREFIX}-sorted.unmapped_read_ids.txt" R1_UNMAPPED_IDS_CLEAN="${R1_PREFIX}-sorted.unmapped_read_ids.clean.txt" R1_UNMAPPED_FASTQ="${R1_PREFIX}.unmapped.fq.gz" echo "[R1] minimap2 map-ont: ${IN_FASTQ}" "${MINIMAP2}" -x map-ont -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${IN_FASTQ}" > "${R1_SAM}" echo "[R1] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R1_BAM}" "${R1_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R1_SORTED_BAM}" "${R1_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R1_SORTED_BAM}" echo "[R1] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R1_SORTED_BAM}" -o "${R1_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R1_UNMAPPED_BAM}" | awk '{print $1}' > "${R1_UNMAPPED_IDS}" tr -d '\r' < "${R1_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R1_UNMAPPED_IDS_CLEAN}" echo "[R1] 未比对 ID 行数:$(wc -l < "${R1_UNMAPPED_IDS_CLEAN}")" echo "[R1] 使用 seqkit 从原始 FASTQ 抽取未比对 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R1_UNMAPPED_IDS_CLEAN}" "${IN_FASTQ}" -o "${R1_UNMAPPED_FASTQ}" echo "[R1] R1_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R1_UNMAPPED_FASTQ}" # R1 只删除原始 SAM 和未排序 BAM echo "[R1] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R1_SAM}" "${R1_BAM}" ############################################################################### # Round 2:minimap2 asm20,对 R1 未比对 reads ############################################################################### R2_PREFIX="${BASE}-r2-asm20" R2_SAM="${R2_PREFIX}.sam" R2_BAM="${R2_PREFIX}.bam" R2_SORTED_BAM="${R2_PREFIX}-sorted.bam" R2_UNMAPPED_BAM="${R2_PREFIX}-sorted-unmapped.bam" R2_UNMAPPED_IDS="${R2_PREFIX}-sorted.unmapped_read_ids.txt" R2_UNMAPPED_IDS_CLEAN="${R2_PREFIX}-sorted.unmapped_read_ids.clean.txt" R2_UNMAPPED_FASTQ="${R2_PREFIX}.unmapped.fq.gz" echo "[R2] minimap2 asm20: ${R1_UNMAPPED_FASTQ}" "${MINIMAP2}" -x asm20 -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${R1_UNMAPPED_FASTQ}" > "${R2_SAM}" echo "[R2] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R2_BAM}" "${R2_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R2_SORTED_BAM}" "${R2_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R2_SORTED_BAM}" echo "[R2] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R2_SORTED_BAM}" -o "${R2_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R2_UNMAPPED_BAM}" | awk '{print $1}' > "${R2_UNMAPPED_IDS}" tr -d '\r' < "${R2_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R2_UNMAPPED_IDS_CLEAN}" echo "[R2] 未比对 ID 行数:$(wc -l < "${R2_UNMAPPED_IDS_CLEAN}")" echo "[R2] 使用 seqkit 从 R1_unmapped FASTQ 抽取 R2 未比对 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R2_UNMAPPED_IDS_CLEAN}" "${R1_UNMAPPED_FASTQ}" -o "${R2_UNMAPPED_FASTQ}" echo "[R2] R2_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R2_UNMAPPED_FASTQ}" # R2 只删除原始 SAM 和未排序 BAM echo "[R2] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R2_SAM}" "${R2_BAM}" ############################################################################### # Round 3:plan A(自定义 map-ont),对 R2 未比对 reads ############################################################################### R3_PREFIX="${BASE}-r3-planA-mapont" R3_SAM="${R3_PREFIX}.sam" R3_BAM="${R3_PREFIX}.bam" R3_SORTED_BAM="${R3_PREFIX}-sorted.bam" R3_UNMAPPED_BAM="${R3_PREFIX}-sorted-unmapped.bam" R3_UNMAPPED_IDS="${R3_PREFIX}-sorted.unmapped_read_ids.txt" R3_UNMAPPED_IDS_CLEAN="${R3_PREFIX}-sorted.unmapped_read_ids.clean.txt" # 最终三轮都未比对上的 reads R3_UNMAPPED_FASTQ="${R3_PREFIX}.unmapped.fq.gz" echo "[R3][planA] minimap2 map-ont with tuned params: ${R2_UNMAPPED_FASTQ}" "${MINIMAP2}" -x map-ont -a --cs=long \ -k13 -w7 \ -r2k,60k -g8k \ -U40,700 \ -s70 -z600,200 \ -N30 -t ${MM2_THREADS} \ "${REF}" "${R2_UNMAPPED_FASTQ}" > "${R3_SAM}" echo "[R3] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R3_BAM}" "${R3_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R3_SORTED_BAM}" "${R3_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R3_SORTED_BAM}" echo "[R3] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R3_SORTED_BAM}" -o "${R3_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R3_UNMAPPED_BAM}" | awk '{print $1}' > "${R3_UNMAPPED_IDS}" tr -d '\r' < "${R3_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R3_UNMAPPED_IDS_CLEAN}" echo "[R3] 未比对 ID 行数:$(wc -l < "${R3_UNMAPPED_IDS_CLEAN}")" echo "[R3] 使用 seqkit 从 R2_unmapped FASTQ 抽取三轮都未比对上的 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R3_UNMAPPED_IDS_CLEAN}" "${R2_UNMAPPED_FASTQ}" -o "${R3_UNMAPPED_FASTQ}" echo "[R3] 最终未比对 FASTQ:${R3_UNMAPPED_FASTQ}" "${SEQKIT}" stats "${R3_UNMAPPED_FASTQ}" # R3 只删除原始 SAM 和未排序 BAM echo "[R3] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R3_SAM}" "${R3_BAM}" echo "=== 全流程完成(map-ont → asm20 → planA-mapont;三轮均未比对的 reads 已输出)===" echo "保留文件(每一轮):" echo " * ${R1_SORTED_BAM} 及其 .bai" echo " * ${R1_UNMAPPED_BAM}" echo " * ${R2_SORTED_BAM} 及其 .bai" echo " * ${R2_UNMAPPED_BAM}" echo " * ${R3_SORTED_BAM} 及其 .bai" echo " * ${R3_UNMAPPED_BAM}"
对于 Chr2、Chr3,你可以复制这份脚本两份,分别改两行即可:
bashIN_FASTQ="/DataZhai/wangzh/2024_11_ATXR5/.../SV-test-fastq/Col-coty-dorado-minimap2-Col-CC-V2-Chr2-SV.fastq" SAMPLE="Col-coty-sup-Chr2-SV"
bashIN_FASTQ="/DataZhai/wangzh/2024_11_ATXR5/.../SV-test-fastq/Col-coty-dorado-minimap2-Col-CC-V2-Chr3-SV.fastq" SAMPLE="Col-coty-sup-Chr3-SV"
这样三份脚本分别跑 Chr1 / Chr2 / Chr3,都用新的 minimap2 路径,并且 只写 --cs=long 不再写 --MD,三轮比对逻辑保持不变。
如果你希望最后某一轮(比如 R1 或 R3)的 SAM 名字完全变成类似
Col-coty-sup-Chr1-SV-cs.sam,我也可以帮你再把前缀变量改成这个精确名字。
我希望你可以写一个循环,一次性依次处理三个文件,三个文件名字只有Chr的数字不同
可以的,我们把这三条 Chr 的 FASTQ 放到一个循环里,一次性按相同流程跑完三轮比对。
下面这个脚本会依次处理:
Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastqCol-coty-dorado-minimap2-Col-CC-V2-Chr2-SV.fastqCol-coty-dorado-minimap2-Col-CC-V2-Chr3-SV.fastq同时满足你之前所有要求:
/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2--cs=long,去掉 --MDSAMPLE="Col-coty-sup-Chr${CHR}-SV",输出前缀类似之前:Col-coty-sup-Chr1-SV-Col-CC-V2-r1-mapont.* 等bash#!/usr/bin/env bash set -euo pipefail ############################################################################### # 线程数 ############################################################################### MM2_THREADS=36 ST_THREADS=36 ############################################################################### # 软件绝对路径 ############################################################################### MINIMAP2="/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2" SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools" SEQKIT="/public/Zhaijx/wangz/Condaenvir/envs/seqkit/bin/seqkit" ############################################################################### # 参考基因组(Col-CC-V2) ############################################################################### REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta" ############################################################################### # 输入/输出目录 ############################################################################### SV_DIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/SV-test-fastq" OUTDIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2" mkdir -p "${OUTDIR}" ############################################################################### # 依次处理 Chr1 / Chr2 / Chr3 ############################################################################### for CHR in 1 2 3; do echo "==============================" echo " 开始处理 Chr${CHR}" echo "==============================" ########################################################################### # 当前染色体对应的输入 FASTQ & SAMPLE 名 # 输入:Col-coty-dorado-minimap2-Col-CC-V2-Chr${CHR}-SV.fastq # 输出前缀 SAMPLE:Col-coty-sup-Chr${CHR}-SV ########################################################################### IN_FASTQ="${SV_DIR}/Col-coty-dorado-minimap2-Col-CC-V2-Chr${CHR}-SV.fastq" SAMPLE="Col-coty-sup-Chr${CHR}-SV" BASE="${OUTDIR}/${SAMPLE}-Col-CC-V2" if [[ ! -s "${IN_FASTQ}" ]]; then echo "ERROR: 输入 FASTQ 不存在或为空: ${IN_FASTQ}" >&2 exit 1 fi echo "[Chr${CHR}] 输入 FASTQ: ${IN_FASTQ}" echo "[Chr${CHR}] SAMPLE : ${SAMPLE}" echo "[Chr${CHR}] BASE 前缀 : ${BASE}" echo ############################################################################# # Round 1:minimap2 map-ont 对原始 FASTQ ############################################################################# R1_PREFIX="${BASE}-r1-mapont" R1_SAM="${R1_PREFIX}.sam" R1_BAM="${R1_PREFIX}.bam" R1_SORTED_BAM="${R1_PREFIX}-sorted.bam" R1_UNMAPPED_BAM="${R1_PREFIX}-sorted-unmapped.bam" R1_UNMAPPED_IDS="${R1_PREFIX}-sorted.unmapped_read_ids.txt" R1_UNMAPPED_IDS_CLEAN="${R1_PREFIX}-sorted.unmapped_read_ids.clean.txt" R1_UNMAPPED_FASTQ="${R1_PREFIX}.unmapped.fq.gz" echo "[Chr${CHR}][R1] minimap2 map-ont: ${IN_FASTQ}" "${MINIMAP2}" -x map-ont -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${IN_FASTQ}" > "${R1_SAM}" echo "[Chr${CHR}][R1] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R1_BAM}" "${R1_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R1_SORTED_BAM}" "${R1_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R1_SORTED_BAM}" echo "[Chr${CHR}][R1] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R1_SORTED_BAM}" -o "${R1_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R1_UNMAPPED_BAM}" | awk '{print $1}' > "${R1_UNMAPPED_IDS}" tr -d '\r' < "${R1_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R1_UNMAPPED_IDS_CLEAN}" echo "[Chr${CHR}][R1] 未比对 ID 行数:$(wc -l < "${R1_UNMAPPED_IDS_CLEAN}")" echo "[Chr${CHR}][R1] 使用 seqkit 从原始 FASTQ 抽取未比对 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R1_UNMAPPED_IDS_CLEAN}" "${IN_FASTQ}" -o "${R1_UNMAPPED_FASTQ}" echo "[Chr${CHR}][R1] R1_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R1_UNMAPPED_FASTQ}" echo "[Chr${CHR}][R1] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R1_SAM}" "${R1_BAM}" echo ############################################################################# # Round 2:minimap2 asm20,对 R1 未比对 reads ############################################################################# R2_PREFIX="${BASE}-r2-asm20" R2_SAM="${R2_PREFIX}.sam" R2_BAM="${R2_PREFIX}.bam" R2_SORTED_BAM="${R2_PREFIX}-sorted.bam" R2_UNMAPPED_BAM="${R2_PREFIX}-sorted-unmapped.bam" R2_UNMAPPED_IDS="${R2_PREFIX}-sorted.unmapped_read_ids.txt" R2_UNMAPPED_IDS_CLEAN="${R2_PREFIX}-sorted.unmapped_read_ids.clean.txt" R2_UNMAPPED_FASTQ="${R2_PREFIX}.unmapped.fq.gz" echo "[Chr${CHR}][R2] minimap2 asm20: ${R1_UNMAPPED_FASTQ}" "${MINIMAP2}" -x asm20 -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${R1_UNMAPPED_FASTQ}" > "${R2_SAM}" echo "[Chr${CHR}][R2] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R2_BAM}" "${R2_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R2_SORTED_BAM}" "${R2_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R2_SORTED_BAM}" echo "[Chr${CHR}][R2] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R2_SORTED_BAM}" -o "${R2_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R2_UNMAPPED_BAM}" | awk '{print $1}' > "${R2_UNMAPPED_IDS}" tr -d '\r' < "${R2_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R2_UNMAPPED_IDS_CLEAN}" echo "[Chr${CHR}][R2] 未比对 ID 行数:$(wc -l < "${R2_UNMAPPED_IDS_CLEAN}")" echo "[Chr${CHR}][R2] 使用 seqkit 从 R1_unmapped FASTQ 抽取 R2 未比对 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R2_UNMAPPED_IDS_CLEAN}" "${R1_UNMAPPED_FASTQ}" -o "${R2_UNMAPPED_FASTQ}" echo "[Chr${CHR}][R2] R2_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R2_UNMAPPED_FASTQ}" echo "[Chr${CHR}][R2] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R2_SAM}" "${R2_BAM}" echo ############################################################################# # Round 3:plan A(自定义 map-ont),对 R2 未比对 reads ############################################################################# R3_PREFIX="${BASE}-r3-planA-mapont" R3_SAM="${R3_PREFIX}.sam" R3_BAM="${R3_PREFIX}.bam" R3_SORTED_BAM="${R3_PREFIX}-sorted.bam" R3_UNMAPPED_BAM="${R3_PREFIX}-sorted-unmapped.bam" R3_UNMAPPED_IDS="${R3_PREFIX}-sorted.unmapped_read_ids.txt" R3_UNMAPPED_IDS_CLEAN="${R3_PREFIX}-sorted.unmapped_read_ids.clean.txt" # 最终三轮都未比对上的 reads R3_UNMAPPED_FASTQ="${R3_PREFIX}.unmapped.fq.gz" echo "[Chr${CHR}][R3][planA] minimap2 map-ont with tuned params: ${R2_UNMAPPED_FASTQ}" "${MINIMAP2}" -x map-ont -a --cs=long \ -k13 -w7 \ -r2k,60k -g8k \ -U40,700 \ -s70 -z600,200 \ -N30 -t ${MM2_THREADS} \ "${REF}" "${R2_UNMAPPED_FASTQ}" > "${R3_SAM}" echo "[Chr${CHR}][R3] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R3_BAM}" "${R3_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R3_SORTED_BAM}" "${R3_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R3_SORTED_BAM}" echo "[Chr${CHR}][R3] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R3_SORTED_BAM}" -o "${R3_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R3_UNMAPPED_BAM}" | awk '{print $1}' > "${R3_UNMAPPED_IDS}" tr -d '\r' < "${R3_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R3_UNMAPPED_IDS_CLEAN}" echo "[Chr${CHR}][R3] 未比对 ID 行数:$(wc -l < "${R3_UNMAPPED_IDS_CLEAN}")" echo "[Chr${CHR}][R3] 使用 seqkit 从 R2_unmapped FASTQ 抽取三轮都未比对上的 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R3_UNMAPPED_IDS_CLEAN}" "${R2_UNMAPPED_FASTQ}" -o "${R3_UNMAPPED_FASTQ}" echo "[Chr${CHR}][R3] 最终未比对 FASTQ:${R3_UNMAPPED_FASTQ}" "${SEQKIT}" stats "${R3_UNMAPPED_FASTQ}" echo "[Chr${CHR}][R3] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R3_SAM}" "${R3_BAM}" echo "=== Chr${CHR} 三轮比对完成 ===" echo "保留文件(每一轮):" echo " * ${R1_SORTED_BAM} 及其 .bai" echo " * ${R1_UNMAPPED_BAM}" echo " * ${R2_SORTED_BAM} 及其 .bai" echo " * ${R2_UNMAPPED_BAM}" echo " * ${R3_SORTED_BAM} 及其 .bai" echo " * ${R3_UNMAPPED_BAM}" echo done echo "=== 全部 Chr1/2/3 处理完成 ==="
如果你想让某一轮(比如 R1)额外输出一个简短名字的 SAM(比如 Col-coty-sup-Chr1-SV-cs.sam)做主结果,我也可以在循环里帮你加一个 cp / ln -s 生成这个别名。
#!/usr/bin/env bash
set -euo pipefail
###############################################################################
###############################################################################
MM2_THREADS=36
ST_THREADS=36
###############################################################################
###############################################################################
MINIMAP2="/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2"
SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools"
SEQKIT="/public/Zhaijx/wangz/Condaenvir/envs/seqkit/bin/seqkit"
###############################################################################
###############################################################################
REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta"
###############################################################################
###############################################################################
OUTDIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2"
mkdir -p "${OUTDIR}"
###############################################################################
###############################################################################
process_fastq() {
local IN_FASTQ="2"
text# 从输入文件名提取基本信息 local FILENAME=$(basename "${IN_FASTQ}" .fastq) local SAMPLE="Col-coty-sup-${CHROMOSOME}-SV" local BASE="${OUTDIR}/${SAMPLE}-Col-CC-V2" echo "========================================" echo "开始处理: ${IN_FASTQ}" echo "染色体: ${CHROMOSOME}" echo "样本名: ${SAMPLE}" echo "========================================" ############################################################################### # Round 1:minimap2 map-ont 对原始 FASTQ ############################################################################### R1_PREFIX="${BASE}-r1-mapont" R1_SAM="${R1_PREFIX}.sam" R1_BAM="${R1_PREFIX}.bam" R1_SORTED_BAM="${R1_PREFIX}-sorted.bam" R1_UNMAPPED_BAM="${R1_PREFIX}-sorted-unmapped.bam" R1_UNMAPPED_IDS="${R1_PREFIX}-sorted.unmapped_read_ids.txt" R1_UNMAPPED_IDS_CLEAN="${R1_PREFIX}-sorted.unmapped_read_ids.clean.txt" R1_UNMAPPED_FASTQ="${R1_PREFIX}.unmapped.fq.gz" echo "[R1] minimap2 map-ont: ${IN_FASTQ}" "${MINIMAP2}" -x map-ont -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${IN_FASTQ}" > "${R1_SAM}" echo "[R1] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R1_BAM}" "${R1_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R1_SORTED_BAM}" "${R1_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R1_SORTED_BAM}" echo "[R1] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R1_SORTED_BAM}" -o "${R1_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R1_UNMAPPED_BAM}" | awk '{print $1}' > "${R1_UNMAPPED_IDS}" tr -d '\r' < "${R1_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R1_UNMAPPED_IDS_CLEAN}" echo "[R1] 未比对 ID 行数:$(wc -l < "${R1_UNMAPPED_IDS_CLEAN}")" echo "[R1] 使用 seqkit 从原始 FASTQ 抽取未比对 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R1_UNMAPPED_IDS_CLEAN}" "${IN_FASTQ}" -o "${R1_UNMAPPED_FASTQ}" echo "[R1] R1_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R1_UNMAPPED_FASTQ}" # R1 只删除原始 SAM 和未排序 BAM echo "[R1] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R1_SAM}" "${R1_BAM}" ############################################################################### # Round 2:minimap2 asm20,对 R1 未比对 reads ############################################################################### R2_PREFIX="${BASE}-r2-asm20" R2_SAM="${R2_PREFIX}.sam" R2_BAM="${R2_PREFIX}.bam" R2_SORTED_BAM="${R2_PREFIX}-sorted.bam" R2_UNMAPPED_BAM="${R2_PREFIX}-sorted-unmapped.bam" R2_UNMAPPED_IDS="${R2_PREFIX}-sorted.unmapped_read_ids.txt" R2_UNMAPPED_IDS_CLEAN="${R2_PREFIX}-sorted.unmapped_read_ids.clean.txt" R2_UNMAPPED_FASTQ="${R2_PREFIX}.unmapped.fq.gz" echo "[R2] minimap2 asm20: ${R1_UNMAPPED_FASTQ}" "${MINIMAP2}" -x asm20 -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${R1_UNMAPPED_FASTQ}" > "${R2_SAM}" echo "[R2] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R2_BAM}" "${R2_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R2_SORTED_BAM}" "${R2_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R2_SORTED_BAM}" echo "[R2] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R2_SORTED_BAM}" -o "${R2_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R2_UNMAPPED_BAM}" | awk '{print $1}' > "${R2_UNMAPPED_IDS}" tr -d '\r' < "${R2_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R2_UNMAPPED_IDS_CLEAN}" echo "[R2] 未比对 ID 行数:$(wc -l < "${R2_UNMAPPED_IDS_CLEAN}")" echo "[R2] 使用 seqkit 从 R1_unmapped FASTQ 抽取 R2 未比对 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R2_UNMAPPED_IDS_CLEAN}" "${R1_UNMAPPED_FASTQ}" -o "${R2_UNMAPPED_FASTQ}" echo "[R2] R2_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R2_UNMAPPED_FASTQ}" # R2 只删除原始 SAM 和未排序 BAM echo "[R2] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R2_SAM}" "${R2_BAM}" ############################################################################### # Round 3:plan A(自定义 map-ont),对 R2 未比对 reads ############################################################################### R3_PREFIX="${BASE}-r3-planA-mapont" R3_SAM="${R3_PREFIX}.sam" R3_BAM="${R3_PREFIX}.bam" R3_SORTED_BAM="${R3_PREFIX}-sorted.bam" R3_UNMAPPED_BAM="${R3_PREFIX}-sorted-unmapped.bam" R3_UNMAPPED_IDS="${R3_PREFIX}-sorted.unmapped_read_ids.txt" R3_UNMAPPED_IDS_CLEAN="${R3_PREFIX}-sorted.unmapped_read_ids.clean.txt" # 最终三轮都未比对上的 reads R3_UNMAPPED_FASTQ="${R3_PREFIX}.unmapped.fq.gz" echo "[R3][planA] minimap2 map-ont with tuned params: ${R2_UNMAPPED_FASTQ}" "${MINIMAP2}" -x map-ont -a --cs=long \ -k13 -w7 \ -r2k,60k -g8k \ -U40,700 \ -s70 -z600,200 \ -N30 -t ${MM2_THREADS} \ "${REF}" "${R2_UNMAPPED_FASTQ}" > "${R3_SAM}" echo "[R3] SAM -> BAM & sort" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -o "${R3_BAM}" "${R3_SAM}" "${SAMTOOLS}" sort -@ ${ST_THREADS} -o "${R3_SORTED_BAM}" "${R3_BAM}" "${SAMTOOLS}" index -@ ${ST_THREADS} "${R3_SORTED_BAM}" echo "[R3] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ ${ST_THREADS} -b -f 4 "${R3_SORTED_BAM}" -o "${R3_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ ${ST_THREADS} "${R3_UNMAPPED_BAM}" | awk '{print $1}' > "${R3_UNMAPPED_IDS}" tr -d '\r' < "${R3_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R3_UNMAPPED_IDS_CLEAN}" echo "[R3] 未比对 ID 行数:$(wc -l < "${R3_UNMAPPED_IDS_CLEAN}")" echo "[R3] 使用 seqkit 从 R2_unmapped FASTQ 抽取三轮都未比对上的 reads" "${SEQKIT}" grep -j ${ST_THREADS} -f "${R3_UNMAPPED_IDS_CLEAN}" "${R2_UNMAPPED_FASTQ}" -o "${R3_UNMAPPED_FASTQ}" echo "[R3] 最终未比对 FASTQ:${R3_UNMAPPED_FASTQ}" "${SEQKIT}" stats "${R3_UNMAPPED_FASTQ}" # R3 只删除原始 SAM 和未排序 BAM echo "[R3] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R3_SAM}" "${R3_BAM}" echo "=== ${CHROMOSOME} 处理完成 ===" echo "保留文件:" echo " * ${R1_SORTED_BAM} 及其 .bai" echo " * ${R1_UNMAPPED_BAM}" echo " * ${R2_SORTED_BAM} 及其 .bai" echo " * ${R2_UNMAPPED_BAM}" echo " * ${R3_SORTED_BAM} 及其 .bai" echo " * ${R3_UNMAPPED_BAM}" echo ""
}
###############################################################################
###############################################################################
FASTQ_DIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/SV-test-fastq"
for CHR in Chr1 Chr2 Chr3; do
IN_FASTQ="{CHR}-SV.fastq"
text# 检查文件是否存在 if [[ ! -f "${IN_FASTQ}" ]]; then echo "错误:文件不存在: ${IN_FASTQ}" continue fi # 调用处理函数 process_fastq "${IN_FASTQ}" "${CHR}"
done
echo "=== 所有染色体处理完成 ===" 请你帮忙仔细检查这段代码,并告诉我他的逻辑功能,以及能否正常运行
先说结论:
这段脚本逻辑是清晰的,能实现你说的“三个 Chr 文件依次跑三轮比对”的流程;只要路径、软件都存在,就可以正常运行。下面我帮你把它拆开讲一下具体做了什么,同时指出几个可以注意的小细节(不是致命错误)。
全局设置
set -euo pipefail:遇到错误立即退出、未定义变量报错、管道中任一命令出错都视为失败(安全)。MM2_THREADS=36(minimap2),ST_THREADS=36(samtools / seqkit)。MINIMAP2="/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2"SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools"SEQKIT="/public/Zhaijx/wangz/Condaenvir/envs/seqkit/bin/seqkit"REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta"OUTDIR="/.../minimap2-Col-CC-V2",并 mkdir -p。定义一个函数 process_fastq IN_FASTQ CHROMOSOME
IN_FASTQ:完整路径,如 .../Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastqCHROMOSOME:Chr1 / Chr2 / Chr3SAMPLE="Col-coty-sup-${CHROMOSOME}-SV"Col-coty-sup-Chr1-SVBASE="${OUTDIR}/${SAMPLE}-Col-CC-V2"/…/Col-coty-sup-Chr1-SV-Col-CC-V2主程序
FASTQ_DIR="/.../SV-test-fastq"for CHR in Chr1 Chr2 Chr3; do ... done
IN_FASTQ="${FASTQ_DIR}/Col-coty-dorado-minimap2-Col-CC-V2-${CHR}-SV.fastq"
→ 即:
Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastqCol-coty-dorado-minimap2-Col-CC-V2-Chr2-SV.fastqCol-coty-dorado-minimap2-Col-CC-V2-Chr3-SV.fastqcontinue(跳过这个 Chr)。process_fastq "${IN_FASTQ}" "${CHR}"在 process_fastq() 里,对每个输入 FASTQ 执行完全一样的流程,只是 CHROMOSOME/SAMPLE/BASE 不同。
map-ont 对原始 FASTQR1_PREFIX="${BASE}-r1-mapont"R1_SAM="${R1_PREFIX}.sam"R1_SORTED_BAM="${R1_PREFIX}-sorted.bam"R1_UNMAPPED_BAM="${R1_PREFIX}-sorted-unmapped.bam"R1_UNMAPPED_IDS, R1_UNMAPPED_IDS_CLEANR1_UNMAPPED_FASTQ="${R1_PREFIX}.unmapped.fq.gz"流程:
比对:
bash"${MINIMAP2}" -x map-ont -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${IN_FASTQ}" > "${R1_SAM}"
--cs=long,没有 --MD,符合你说的“这两个 tag 冲突,去掉 --MD”。SAM → BAM → sort → index:
bashsamtools view -@ ST_THREADS -b -o R1_BAM R1_SAM samtools sort -@ ST_THREADS -o R1_SORTED_BAM R1_BAM samtools index -@ ST_THREADS R1_SORTED_BAM
提取未比对 reads(flag 4):
bashsamtools view -@ ST_THREADS -b -f 4 R1_SORTED_BAM -o R1_UNMAPPED_BAM samtools view -@ ST_THREADS R1_UNMAPPED_BAM | awk '{print $1}' > R1_UNMAPPED_IDS tr -d '\r' < R1_UNMAPPED_IDS | awk 'length($0)>0' | LC_ALL=C sort -u > R1_UNMAPPED_IDS_CLEAN
用 seqkit 从原始 FASTQ 抽取未比对 reads:
bashseqkit grep -j ST_THREADS -f R1_UNMAPPED_IDS_CLEAN IN_FASTQ -o R1_UNMAPPED_FASTQ
统计 + 清理:
seqkit stats R1_UNMAPPED_FASTQR1_SAM 和 R1_BAM,保留 sorted.bam / bai / unmapped.bam 和 unmapped.fq.gz。asm20 对 R1 未比对 FASTQR2_PREFIX="${BASE}-r2-asm20"R1_UNMAPPED_FASTQ-x asm20R1_unmapped.fq.gz 抽 R2 未比对 reads,写到 R2_UNMAPPED_FASTQ。关键比对命令:
bash"${MINIMAP2}" -x asm20 -a --cs=long \ -t ${MM2_THREADS} \ "${REF}" "${R1_UNMAPPED_FASTQ}" > "${R2_SAM}"
同样只有 --cs=long。
前缀:R3_PREFIX="${BASE}-r3-planA-mapont"
输入 FASTQ:R2_UNMAPPED_FASTQ
参数更激进:
bash"${MINIMAP2}" -x map-ont -a --cs=long \ -k13 -w7 \ -r2k,60k -g8k \ -U40,700 \ -s70 -z600,200 \ -N30 -t ${MM2_THREADS} \ "${REF}" "${R2_UNMAPPED_FASTQ}" > "${R3_SAM}"
后续流程同样:
R3_UNMAPPED_FASTQ(三轮都没比上的 reads)seqkit stats R3_UNMAPPED_FASTQR3_SAM & R3_BAM最终保留的文件(每个 Chr):
BASE-r1-mapont-sorted.bam 及 .baiBASE-r1-mapont-sorted-unmapped.bamBASE-r2-asm20-sorted.bam 及 .baiBASE-r2-asm20-sorted-unmapped.bamBASE-r3-planA-mapont-sorted.bam 及 .baiBASE-r3-planA-mapont-sorted-unmapped.bamBASE-r3-planA-mapont.unmapped.fq.gz(三轮的 unmapped.fq.gz 也都会保留)
在下面前提满足情况下,脚本可以正常运行:
.../Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastq.../Col-coty-dorado-minimap2-Col-CC-V2-Chr2-SV.fastq.../Col-coty-dorado-minimap2-Col-CC-V2-Chr3-SV.fastqMINIMAP2, SAMTOOLS, SEQKIT 路径都有效且具有执行权限。REF 存在且格式正确。逻辑上没有明显 bug,变量在使用前都被赋值,set -u 不会踩雷。
变量 local 化(风格问题,不影响正确性)
在 process_fastq 里只给 IN_FASTQ 和 CHROMOSOME 加了 local,
其它比如 R1_PREFIX / R2_PREFIX / R3_PREFIX 其实也可以 local,防止将来函数里面又调用其它函数时变量相互污染。现在这种写法在顺序单线程执行时也不会出错,只是代码更“干净”会更好。
FILENAME 没有被使用
bashlocal FILENAME=$(basename "${IN_FASTQ}" .fastq)
这个变量后面没有用到,可以删掉,或者将来用它拼更复杂的名字。
0 个未比对 reads 的情况
R*_UNMAPPED_IDS_CLEAN 会是空文件;seqkit grep -f 空文件 会输出一个 0 read 的 FASTQ(或空文件);bashif [[ ! -s "${R1_UNMAPPED_IDS_CLEAN}" ]]; then echo "R1 已经全部比对上,跳过 R2/R3" return 0 fi
这属于优化,不是必须。
输出是否需要一个最终 SAM 名字(比如你最早说的 ...-cs.sam)
当前脚本会把所有 *.sam 在对应轮次结束后删除,只留下 BAM 和 unmapped FASTQ。
如果你确实还想要一个类似:
text/.../Col-coty-sup-Chr1-SV-cs.sam
作为“主结果”,可以在 R1 比对后加一行:
bashcp "${R1_SAM}" "${OUTDIR}/${SAMPLE}-cs.sam"
再删 R1_SAM。
现在的脚本逻辑里没有这个需求就不用加。
功能概括:
map-ont 初次比对 → 提取未比对 reads;asm20 再比对;map-ont 参数再比对;--cs=long,不再使用 --MD。是否能正常运行:
如果你愿意,我可以在这个版本基础上再加一个“如果某一轮完全匹配就跳过后面轮次”的逻辑,或者帮你加上那个 Col-coty-sup-ChrX-SV-cs.sam 的最终输出。
什么叫某一轮完全匹配就跳过后面轮次
我刚才说的“某一轮完全匹配就跳过后面轮次”,其实是一个可选优化思路,不是你脚本现在必须做的事。具体意思是:
对每个 Chr,你现在是:
R1:map-ont
R1_SORTED_BAMR1_UNMAPPED_FASTQR2:asm20
R1_UNMAPPED_FASTQ(只含 R1 里没比上的 reads)R2_SORTED_BAMR2_UNMAPPED_FASTQR3:planA map-ont(调参)
R2_UNMAPPED_FASTQR3_SORTED_BAMR3_UNMAPPED_FASTQ不管每一轮有多少未比对 reads,脚本都会继续跑到下一轮。
举例说 R1:
R1_UNMAPPED_IDS_CLEANbashR1_UNMAPPED_IDS_CLEAN # 文件存在但大小为 0(空文件)
也就是说:
这一轮里,已经没有任何未比对的 reads 了。
那这种情况,我就称之为:
“R1 完全匹配”,也可以说“R1 已经把所有 reads 都对上了”。
同理,如果 R2 的未比对 ID 文件 R2_UNMAPPED_IDS_CLEAN 也完全空了,就可以说:
“R2 完全匹配”。
假设 R1 就已经全部匹配:
R1_UNMAPPED_IDS_CLEAN 是空的R1_UNMAPPED_FASTQ 里也就没有 reads 或者是 0 条这时继续往下做:
结果:
所以,优化思路是:
同理,如果:
R2_UNMAPPED_IDS_CLEAN 为空):
注意:这只是“锦上添花”的优化,你现在脚本不加也能正常跑。
以 R1 为例,在 process_fastq 里的这几行后面:
bashtr -d '\r' < "${R1_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R1_UNMAPPED_IDS_CLEAN}" echo "[R1] 未比对 ID 行数:$(wc -l < "${R1_UNMAPPED_IDS_CLEAN}")"
可以加一段判断:
bash# 如果这一轮已经没有未比对 reads,就不用跑 R2/R3 了 if [[ ! -s "${R1_UNMAPPED_IDS_CLEAN}" ]]; then echo "[R1] 所有 reads 在 R1 中都已比对上,跳过 R2/R3。" # 这里仍然可以保留 R1 的统计结果 "${SEQKIT}" stats "${IN_FASTQ}" || true # 不再进行后续轮次,直接结束这个染色体的处理 return 0 fi
类似地,在 R2 结束后也可以加:
bashtr -d '\r' < "${R2_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R2_UNMAPPED_IDS_CLEAN}" echo "[R2] 未比对 ID 行数:$(wc -l < "${R2_UNMAPPED_IDS_CLEAN}")" if [[ ! -s "${R2_UNMAPPED_IDS_CLEAN}" ]]; then echo "[R2] 所有剩余 reads 在 R2 中都已比对上,跳过 R3。" "${SEQKIT}" stats "${R1_UNMAPPED_FASTQ}" || true return 0 fi
这样逻辑就是:
“某一轮完全匹配就跳过后面轮次”
就是:如果某一轮比对之后,已经没有任何未比对的 reads(未比对 ID 文件是空的),那再继续往后跑就没有意义,可以直接结束这条染色体的分析,节省时间和资源。
你现在的脚本没有实现这个“跳过”的优化,而是无论如何都会跑完 R1→R2→R3 三轮——这是完全没问题的,只是稍微多用点算力而已。