#PBS -N minimap2-Col-coty #PBS -l nodes=node03:ppn...

Created on: December 9, 2025

Answered using GPT-5.1 Thinking by Chat01

Question

#PBS -N minimap2-Col-coty
#PBS -l nodes=node03:ppn=10
#PBS -V
#PBS -j oe
#PBS -l mem=10g
#PBS -l walltime=399:59:00
cd $PBS_O_WORKDIR

set -euo pipefail

/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2
-x map-ont
-a
--cs=long
-t 36
/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta
/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/SV-test-fastq/Col-coty-dorado-minimap2-Col-CC-V2-Chr1-SV.fastq \

/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/Col-coty-sup-Chr1-SV-cs.sam
SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools" 请你接着把这个代码写完成,我们Col-coty-dorado-minimap2-Col-CC-V2-Chr2-SV.fastq Col-coty-dorado-minimap2-Col-CC-V2-Chr3-SV.fastq 后面需要把这两个比对也完成,同时samtools 转成bam文件,并进行sort 以及 index set -euo pipefail

线程数

MM2_THREADS=36
ST_THREADS=36

软件绝对路径(minimap2 在环境里,直接调用)

SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools"
SEQKIT="/public/Zhaijx/wangz/Condaenvir/envs/seqkit/bin/seqkit"

参考基因组(Col-CC-V2)

REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta"

原始输入 FASTQ(如有更新,只改这行即可)

IN_FASTQ="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/fastq/Col-cotyledon-dor-pass-fail-sup.fastq"

输出目录

OUTDIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2"
mkdir -p "${OUTDIR}"

sample 名,用来拼接输出文件前缀

SAMPLE="Col-coty-dorado"
BASE="OUTDIR/{OUTDIR}/{SAMPLE}-Col-CC-V2"

###############################################################################

Round 1:minimap2 map-ont 对原始 FASTQ

###############################################################################
R1_PREFIX="BASEr1mapont"R1SAM="{BASE}-r1-mapont" R1_SAM="{R1_PREFIX}.sam"
R1_BAM="R1PREFIX.bam"R1SORTEDBAM="{R1_PREFIX}.bam" R1_SORTED_BAM="{R1_PREFIX}-sorted.bam"
R1_UNMAPPED_BAM="R1PREFIXsortedunmapped.bam"R1UNMAPPEDIDS="{R1_PREFIX}-sorted-unmapped.bam" R1_UNMAPPED_IDS="{R1_PREFIX}-sorted.unmapped_read_ids.txt"
R1_UNMAPPED_IDS_CLEAN="R1PREFIXsorted.unmappedreadids.clean.txt"R1UNMAPPEDFASTQ="{R1_PREFIX}-sorted.unmapped_read_ids.clean.txt" R1_UNMAPPED_FASTQ="{R1_PREFIX}.unmapped.fq.gz"

echo "[R1] minimap2 map-ont: INFASTQ"minimap2xmapontaMDcs=long t{IN_FASTQ}" minimap2 -x map-ont -a --MD --cs=long \ -t {MM2_THREADS}
"REF""{REF}" "{IN_FASTQ}" > "${R1_SAM}"

echo "[R1] SAM -> BAM & sort"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} -b -o "R1BAM""{R1_BAM}" "{R1_SAM}"
"SAMTOOLS"sort@{SAMTOOLS}" sort -@ {ST_THREADS} -o "R1SORTEDBAM""{R1_SORTED_BAM}" "{R1_BAM}"
"SAMTOOLS"index@{SAMTOOLS}" index -@ {ST_THREADS} "${R1_SORTED_BAM}"

echo "[R1] 提取未比对 reads 的 ID (flag 4)"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} -b -f 4 "R1SORTEDBAM"o"{R1_SORTED_BAM}" -o "{R1_UNMAPPED_BAM}"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} "{R1_UNMAPPED_BAM}" | awk '{print 1}' > "${R1_UNMAPPED_IDS}"

tr -d '\r' < "R1UNMAPPEDIDS"awklength({R1_UNMAPPED_IDS}" | awk 'length(0)>0' | LC_ALL=C sort -u > "${R1_UNMAPPED_IDS_CLEAN}"

echo "[R1] 未比对 ID 行数:(wcl<"(wc -l < "{R1_UNMAPPED_IDS_CLEAN}")"

echo "[R1] 使用 seqkit 从原始 FASTQ 抽取未比对 reads"

这里只提取 ID(QNAME),seqkit 默认按 ID 匹配,不用 -n

"SEQKIT"grepj{SEQKIT}" grep -j {ST_THREADS} -f "R1UNMAPPEDIDSCLEAN""{R1_UNMAPPED_IDS_CLEAN}" "{IN_FASTQ}" -o "${R1_UNMAPPED_FASTQ}"

echo "[R1] R1_unmapped FASTQ 统计:"
"SEQKIT"stats"{SEQKIT}" stats "{R1_UNMAPPED_FASTQ}"

R1 只删除原始 SAM 和未排序 BAM

echo "[R1] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam"
rm -f "R1SAM""{R1_SAM}" "{R1_BAM}"

###############################################################################

Round 2:minimap2 asm20,对 R1 未比对 reads

###############################################################################
R2_PREFIX="BASEr2asm20"R2SAM="{BASE}-r2-asm20" R2_SAM="{R2_PREFIX}.sam"
R2_BAM="R2PREFIX.bam"R2SORTEDBAM="{R2_PREFIX}.bam" R2_SORTED_BAM="{R2_PREFIX}-sorted.bam"
R2_UNMAPPED_BAM="R2PREFIXsortedunmapped.bam"R2UNMAPPEDIDS="{R2_PREFIX}-sorted-unmapped.bam" R2_UNMAPPED_IDS="{R2_PREFIX}-sorted.unmapped_read_ids.txt"
R2_UNMAPPED_IDS_CLEAN="R2PREFIXsorted.unmappedreadids.clean.txt"R2UNMAPPEDFASTQ="{R2_PREFIX}-sorted.unmapped_read_ids.clean.txt" R2_UNMAPPED_FASTQ="{R2_PREFIX}.unmapped.fq.gz"

echo "[R2] minimap2 asm20: R1UNMAPPEDFASTQ"minimap2xasm20aMDcs=long t{R1_UNMAPPED_FASTQ}" minimap2 -x asm20 -a --MD --cs=long \ -t {MM2_THREADS}
"REF""{REF}" "{R1_UNMAPPED_FASTQ}" > "${R2_SAM}"

echo "[R2] SAM -> BAM & sort"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} -b -o "R2BAM""{R2_BAM}" "{R2_SAM}"
"SAMTOOLS"sort@{SAMTOOLS}" sort -@ {ST_THREADS} -o "R2SORTEDBAM""{R2_SORTED_BAM}" "{R2_BAM}"
"SAMTOOLS"index@{SAMTOOLS}" index -@ {ST_THREADS} "${R2_SORTED_BAM}"

echo "[R2] 提取未比对 reads 的 ID (flag 4)"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} -b -f 4 "R2SORTEDBAM"o"{R2_SORTED_BAM}" -o "{R2_UNMAPPED_BAM}"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} "{R2_UNMAPPED_BAM}" | awk '{print 1}' > "${R2_UNMAPPED_IDS}"

tr -d '\r' < "R2UNMAPPEDIDS"awklength({R2_UNMAPPED_IDS}" | awk 'length(0)>0' | LC_ALL=C sort -u > "${R2_UNMAPPED_IDS_CLEAN}"

echo "[R2] 未比对 ID 行数:(wcl<"(wc -l < "{R2_UNMAPPED_IDS_CLEAN}")"

echo "[R2] 使用 seqkit 从 R1_unmapped FASTQ 抽取 R2 未比对 reads"
"SEQKIT"grepj{SEQKIT}" grep -j {ST_THREADS} -f "R2UNMAPPEDIDSCLEAN""{R2_UNMAPPED_IDS_CLEAN}" "{R1_UNMAPPED_FASTQ}" -o "${R2_UNMAPPED_FASTQ}"

echo "[R2] R2_unmapped FASTQ 统计:"
"SEQKIT"stats"{SEQKIT}" stats "{R2_UNMAPPED_FASTQ}"

R2 只删除原始 SAM 和未排序 BAM

echo "[R2] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam"
rm -f "R2SAM""{R2_SAM}" "{R2_BAM}"

###############################################################################

Round 3:plan A(自定义 map-ont),对 R2 未比对 reads

###############################################################################
R3_PREFIX="BASEr3planAmapont"R3SAM="{BASE}-r3-planA-mapont" R3_SAM="{R3_PREFIX}.sam"
R3_BAM="R3PREFIX.bam"R3SORTEDBAM="{R3_PREFIX}.bam" R3_SORTED_BAM="{R3_PREFIX}-sorted.bam"
R3_UNMAPPED_BAM="R3PREFIXsortedunmapped.bam"R3UNMAPPEDIDS="{R3_PREFIX}-sorted-unmapped.bam" R3_UNMAPPED_IDS="{R3_PREFIX}-sorted.unmapped_read_ids.txt"
R3_UNMAPPED_IDS_CLEAN="R3PREFIXsorted.unmappedreadids.clean.txt"R3UNMAPPEDFASTQ="{R3_PREFIX}-sorted.unmapped_read_ids.clean.txt" R3_UNMAPPED_FASTQ="{R3_PREFIX}.unmapped.fq.gz" # 最终三轮都未比对上的 reads

echo "[R3][planA] minimap2 map-ont with tuned params: R2UNMAPPEDFASTQ"minimap2xmapontaMDcs=long k13w7 r2k,60kg8k U40,700 s70z600,200 N30t{R2_UNMAPPED_FASTQ}" minimap2 -x map-ont -a --MD --cs=long \ -k13 -w7 \ -r2k,60k -g8k \ -U40,700 \ -s70 -z600,200 \ -N30 -t {MM2_THREADS}
"REF""{REF}" "{R2_UNMAPPED_FASTQ}" > "${R3_SAM}"

echo "[R3] SAM -> BAM & sort"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} -b -o "R3BAM""{R3_BAM}" "{R3_SAM}"
"SAMTOOLS"sort@{SAMTOOLS}" sort -@ {ST_THREADS} -o "R3SORTEDBAM""{R3_SORTED_BAM}" "{R3_BAM}"
"SAMTOOLS"index@{SAMTOOLS}" index -@ {ST_THREADS} "${R3_SORTED_BAM}"

echo "[R3] 提取未比对 reads 的 ID (flag 4)"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} -b -f 4 "R3SORTEDBAM"o"{R3_SORTED_BAM}" -o "{R3_UNMAPPED_BAM}"
"SAMTOOLS"view@{SAMTOOLS}" view -@ {ST_THREADS} "{R3_UNMAPPED_BAM}" | awk '{print 1}' > "${R3_UNMAPPED_IDS}"

tr -d '\r' < "R3UNMAPPEDIDS"awklength({R3_UNMAPPED_IDS}" | awk 'length(0)>0' | LC_ALL=C sort -u > "${R3_UNMAPPED_IDS_CLEAN}"

echo "[R3] 未比对 ID 行数:(wcl<"(wc -l < "{R3_UNMAPPED_IDS_CLEAN}")"

echo "[R3] 使用 seqkit 从 R2_unmapped FASTQ 抽取三轮都未比对上的 reads"
"SEQKIT"grepj{SEQKIT}" grep -j {ST_THREADS} -f "R3UNMAPPEDIDSCLEAN""{R3_UNMAPPED_IDS_CLEAN}" "{R2_UNMAPPED_FASTQ}" -o "${R3_UNMAPPED_FASTQ}"

echo "[R3] 最终未比对 FASTQ:R3UNMAPPEDFASTQ""{R3_UNMAPPED_FASTQ}" "{SEQKIT}" stats "${R3_UNMAPPED_FASTQ}"

R3 只删除原始 SAM 和未排序 BAM

echo "[R3] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam"
rm -f "R3SAM""{R3_SAM}" "{R3_BAM}"

echo "=== 全流程完成(map-ont → asm20 → planA-mapont;三轮均未比对的 reads 已输出)==="
echo "保留文件(每一轮):"
echo " * R1SORTEDBAM及其.bai"echo"{R1_SORTED_BAM} 及其 .bai" echo " * {R1_UNMAPPED_BAM}"
echo " * R2SORTEDBAM及其.bai"echo"{R2_SORTED_BAM} 及其 .bai" echo " * {R2_UNMAPPED_BAM}"
echo " * R3SORTEDBAM及其.bai"echo"{R3_SORTED_BAM} 及其 .bai" echo " * {R3_UNMAPPED_BAM}" 同时都使用三轮比对来完成这个比对任务

Thought

Responding to code request in Chinese

The user wants to continue a PBS script for mapping both Chr2 and Chr3 FASTQ files with minimap2, similar to Chr1. Then, they want to process all three files, converting SAM to BAM and sorting/indexing them using samtools. And they'll be using a three-round alignment process for each data file.

Clarifying the user’s multi-step code request

The user wants to extend the given PBS script by aligning Chr2 and Chr3 fastqs with minimap2, just like Chr1. Then, they want to convert all three outputs (SAM) to BAM, followed by sorting and indexing with samtools. They’re presenting another script that includes multi-round mapping for the alignment process.

Clarifying user’s multi-round alignment task

The user seems to want two separate things: first, extending the existing Chr1/2/3 script to align those specific fastqs, and second, applying a separate three-round mapping pipeline to handle whole-genome tasks. They may want to unify these by using the three-round alignment for each of the Chr1/2/3 fastqs as well.

Interpreting the user's request clearly

The user seems to have mostly completed the three-round mapping script but needs help ensuring it's correct or applied properly. They’ve pasted a detailed script and mentioned the three-round pipeline, which likely applies to it. Based on system instructions, I’ll assume they need a full PBS bash script with defined options, variables, and the three-round alignment.

Considering script integration and modifications

The user has shared a multi-round mapping script and a separate PBS script for aligning individual chromosomes. They likely want to combine these, using the three-round pipeline for each chromosome-specific FASTQ. Specifically, they'd like the pipeline to handle Chr1, Chr2, and Chr3 files by adapting it to loop over these inputs.

Interpreting script modifications and requests

It seems the user is asking for help to modify a script for aligning two additional chromosome-specific fastqs using the 3-round pipeline, then convert them to sorted BAM files. They might want to adapt an existing 3-round mapping pipeline for these fastqs, or they could be presenting a separate task and just need assistance completing a new job script.

Clarifying task goals and proposed solution

It looks like the user wants to use the 3-round mapping pipeline for way more samples than what the second script handles right now. They may want to adapt the pipeline to process Chr1, Chr2, and Chr3-SV fastqs, using loops to handle the multiple files, and possibly incorporating the 3-round mapping. I'll suggest a combined solution with this approach.

Creating unified mapping pipeline script

I plan to create a unified script using a PBS header, arrays for Chr1, Chr2, and Chr3, and adapt the 3-round pipeline for multiple chr FASTQ files. I'll set the output to a matching OUTDIR and ensure the SAMtools operations for Chr2/Chr3 are integrated too. I'll then provide the second script with a looping structure to extend the 3-round pipeline for these inputs.

Focusing primarily on the first script

I think the user mainly wants me to continue with the first script. They've given some context with the second script, but it seems like that may just be their progress or reference, not a request. Since their main ask involves aligning Chr2 and Chr3 fastqs with samtools conversions, I’ll focus on completing the first script with the specified changes.

Creating the PBS script to map per-chr FASTQs

I’m going to focus on providing the full PBS script they asked for, improving it with the necessary adjustments for Chr2 and Chr3 SV FASTQs. I'll use minimap2 with the appropriate flags, and SAMTOOLS for conversion. As for threading, I’ll respect their preference for -t 36 but will set it to 10 threads based on shared PBS resources.

Clarifying multi-round mapping and providing final script

I'll stick to providing the complete PBS script that processes Chr2 and Chr3, mapping them with minimap2 and handling SAM conversion with SAMTOOLS. I'll include the full script for one-round mapping, respecting their chosen thread count for consistency. Later, I’ll briefly discuss how to adapt their three-round pipeline script for per-chr SV fastqs if they wish to expand later.

Thought for 4m 41s

Answer

先把你最前面的 PBS 脚本补全一下:
对 Chr1 / Chr2 / Chr3 三个 SV.fastq 都跑 minimap2,然后用 samtools 转 BAM、sort 和 index。
我顺手把路径和命名都参数化了一下,方便以后改。

bash
#!/bin/bash #PBS -N minimap2-Col-coty #PBS -l nodes=node03:ppn=10 #PBS -V #PBS -j oe #PBS -l mem=10g #PBS -l walltime=399:59:00 cd "$PBS_O_WORKDIR" set -euo pipefail # 线程数(根据实际资源改) THREADS=36 # 软件路径 MINIMAP2="/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2" SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools" # 参考基因组 REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta" # 输入 / 输出目录 IN_DIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/SV-test-fastq" OUT_DIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2" mkdir -p "${OUT_DIR}" # 依次处理 Chr1 / Chr2 / Chr3 for chr in Chr1 Chr2 Chr3; do IN_FASTQ="${IN_DIR}/Col-coty-dorado-minimap2-Col-CC-V2-${chr}-SV.fastq" # 输出前缀与原来 Chr1 保持一致:Col-coty-sup-ChrX-SV-cs.* PREFIX="${OUT_DIR}/Col-coty-sup-${chr}-SV-cs" SAM="${PREFIX}.sam" BAM="${PREFIX}.bam" SORTED_BAM="${PREFIX}.sorted.bam" echo "[$(date)] minimap2 对 ${chr} 开始比对:${IN_FASTQ}" "${MINIMAP2}" \ -x map-ont \ -a \ --cs=long \ -t "${THREADS}" \ "${REF}" \ "${IN_FASTQ}" \ > "${SAM}" echo "[$(date)] samtools view/sort/index 处理 ${chr}" # SAM -> BAM "${SAMTOOLS}" view -@ "${THREADS}" -b -o "${BAM}" "${SAM}" # sort "${SAMTOOLS}" sort -@ "${THREADS}" -o "${SORTED_BAM}" "${BAM}" # index "${SAMTOOLS}" index -@ "${THREADS}" "${SORTED_BAM}" # 可选:节省空间,删除中间文件 rm -f "${SAM}" "${BAM}" echo "[$(date)] ${chr} 处理完成:${SORTED_BAM} 以及 ${SORTED_BAM}.bai" done echo "[$(date)] 所有 Chr1 / Chr2 / Chr3 minimap2 + samtools 流程完成。"

关于你后面的“三轮比对”脚本

你后面贴的那个大脚本(R1: map-ont → R2: asm20 → R3: planA-mapont,加上 unmapped 提取)本身已经是一个完整的三轮比对流程了,直接保存成一个脚本单样本跑就可以完成“用三轮比对完成这个比对任务”。

如果你也想让 多个 FASTQ(比如不同子集)都跑三轮比对,可以把那段 R1/R2/R3 流程封装成函数,再在一个 for 循环里传不同的 IN_FASTQBASE 进去即可——思路和我上面对 Chr1/2/3 的写法是一样的。需要的话我也可以帮你把三轮流程改成可复用函数版。

Question

是的,我需要每个都进行三轮比对,请你照着这个意思进行修改代码

Thought

Planning multi-input mapping script

It looks like the user wants to adjust their pipeline for multiple FASTQ files. They previously solved single-chromosome cases, and now they want to generalize it to handle multiple chromosomes with the three-round mapping pipeline. I'll write a Bash script that loops over each FASTQ file, runs the three rounds, and converts to BAM, ensuring proper sorting and indexing. I'll keep the script bilingual, considering the user's use of Chinese, and avoid asking for more clarification.

Adapting pipeline for per-chromosome FASTQs

Okay, so the user's second script works for a single FASTQ input, but now, they want to adapt it for multiple per-chromosome inputs. We can modify that script to loop through per-chromosome FASTQs, apply the same 3-round mapping, and use the same reference genome. I’ll generalize it so they can supply different input FASTQs and output directories while keeping track of the prefix and naming pattern.

Maintaining parameter consistency and finalizing script

I need to keep the mm2/asm20/planA parameters exactly as in the user script, like using "minimap2" directly without the full path. The earlier script called it with a specific path, but since it's in the environment, I should just call "minimap2". Now, I’ll rewrite the PBS script with new HPC directives and ensure it works for both raw sample FASTQs and per-chromosome SV FASTQs.

Unifying the 3-round pipeline

It seems the user wants me to create one script that can handle both the global 3-round pipeline for a whole sample and the separate 3-round process for each per-chromosome SV FASTQ. I’ll merge both, so each of the chromosome SV FASTQs undergoes the same 3-round mapping. This will replace the previous single-round version.

Writing the modified 3-round script

I’m working on a script that loops through both the whole sample and per-chromosome SV FASTQs. I'll set up a function for the 3-round process, allowing me to call it for each input file while defining necessary variables. I’ll also make sure to handle paths and avoid naming conflicts, keeping variables local in the function. One thing to watch is matching the thread allocation in the PBS script while maintaining the user’s desired settings.

Unifying the mapping script

The user had initially used a direct path for minimap2, but now prefers the standard "minimap2" from the environment. I'll modify the script to accommodate this and switch to a more generic structure. Since they want each FASTQ file to go through the 3-round mapping, I’ll provide a script with functions and loops, ensuring flexibility without adding unwanted PBS headers. The user can integrate it into their PBS environment as needed.

Integrating multi-round script with PBS header

Since the user confirmed they want a 3-round mapping for each chromosome FASTQ, I'll take the stand-alone script and modify it to integrate the multi-round approach with proper PBS directives. The script will handle Chr1/2/3 with the 3-round function, and I'll ensure the code is well-commented, with thoughtful variable naming and indentation for clarity.

Setting up the 3-round mapping script

I've defined the entire 3-round mapping function, where I handle R1, R2, and R3 using the MINIMAP2 variable. The function processes both full FASTQ samples and chromosome-specific SV subsets, following the user's guidelines. I'll keep variable quoting and expansions correct, and reuse the base path for output. I’ll ensure everything is consistent with the user’s initial file names and sample identifiers.

Implementing R2 and R3 with correct variables and header decisions

I'll implement R2 and R3 using the provided mapping settings and ensure that the relevant variables, such as MINIMAP2 and SEQKIT, are correctly included. For the PBS header, I'll retain the existing ones, adding a comment about thread adjustments but leaving the

Thought for 54s

Answer

好,那我直接把“三轮比对”封装成一个函数,然后对 原始 FASTQChr1/Chr2/Chr3 的 SV FASTQ 都跑三轮。
你以后如果有别的 FASTQ,只要再调用一次 run_three_rounds 就行。

下面是完整脚本(包含 PBS 头)——你可以直接保存为 Col-coty-3round.pbs 然后 qsub

bash
#!/bin/bash #PBS -N Col-coty-3round #PBS -l nodes=node03:ppn=10 #PBS -V #PBS -j oe #PBS -l mem=10g #PBS -l walltime=399:59:00 cd "$PBS_O_WORKDIR" set -euo pipefail ############################################################################### # 通用参数 ############################################################################### # 线程数(注意根据实际申请的 ppn 调整) MM2_THREADS=36 ST_THREADS=36 # 软件绝对路径 MINIMAP2="/DataZhai/wangzh/software/minimap2-2.30_x64-linux/minimap2" SAMTOOLS="/public/Zhaijx/wangz/Condaenvir/envs/samtools/bin/samtools" SEQKIT="/public/Zhaijx/wangz/Condaenvir/envs/seqkit/bin/seqkit" # 参考基因组(Col-CC-V2) REF="/DataZhai/wangzh/genome/Col-CC-V2/Col-CC-V2.fasta" # 输出目录 OUTDIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2" mkdir -p "${OUTDIR}" ############################################################################### # 封装:对任意一个 FASTQ 做三轮比对 # 入参1:IN_FASTQ 输入 fastq # 入参2:SAMPLE_TAG 样本标签,用于拼接前缀(不要带路径) ############################################################################### run_three_rounds() { local IN_FASTQ="$1" local SAMPLE_TAG="$2" # 例如:Col-coty-dorado,或者 Col-coty-dorado-Chr1-SV # 所有输出都放在 OUTDIR 下,以 SAMPLE_TAG-Col-CC-V2 为前缀 local BASE="${OUTDIR}/${SAMPLE_TAG}-Col-CC-V2" echo "===================================================================" echo "=== 三轮比对开始 ===" echo "IN_FASTQ : ${IN_FASTQ}" echo "BASE : ${BASE}" echo "时间 : $(date)" echo "===================================================================" ############################################################################# # Round 1:minimap2 map-ont 对原始 FASTQ ############################################################################# local R1_PREFIX="${BASE}-r1-mapont" local R1_SAM="${R1_PREFIX}.sam" local R1_BAM="${R1_PREFIX}.bam" local R1_SORTED_BAM="${R1_PREFIX}-sorted.bam" local R1_UNMAPPED_BAM="${R1_PREFIX}-sorted-unmapped.bam" local R1_UNMAPPED_IDS="${R1_PREFIX}-sorted.unmapped_read_ids.txt" local R1_UNMAPPED_IDS_CLEAN="${R1_PREFIX}-sorted.unmapped_read_ids.clean.txt" local R1_UNMAPPED_FASTQ="${R1_PREFIX}.unmapped.fq.gz" echo "[R1] minimap2 map-ont: ${IN_FASTQ}" "${MINIMAP2}" -x map-ont -a --MD --cs=long \ -t "${MM2_THREADS}" \ "${REF}" "${IN_FASTQ}" > "${R1_SAM}" echo "[R1] SAM -> BAM & sort" "${SAMTOOLS}" view -@ "${ST_THREADS}" -b -o "${R1_BAM}" "${R1_SAM}" "${SAMTOOLS}" sort -@ "${ST_THREADS}" -o "${R1_SORTED_BAM}" "${R1_BAM}" "${SAMTOOLS}" index -@ "${ST_THREADS}" "${R1_SORTED_BAM}" echo "[R1] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ "${ST_THREADS}" -b -f 4 "${R1_SORTED_BAM}" -o "${R1_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ "${ST_THREADS}" "${R1_UNMAPPED_BAM}" | awk '{print $1}' > "${R1_UNMAPPED_IDS}" tr -d '\r' < "${R1_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R1_UNMAPPED_IDS_CLEAN}" echo "[R1] 未比对 ID 行数:$(wc -l < "${R1_UNMAPPED_IDS_CLEAN}")" echo "[R1] 使用 seqkit 从原始 FASTQ 抽取未比对 reads" "${SEQKIT}" grep -j "${ST_THREADS}" -f "${R1_UNMAPPED_IDS_CLEAN}" "${IN_FASTQ}" -o "${R1_UNMAPPED_FASTQ}" echo "[R1] R1_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R1_UNMAPPED_FASTQ}" # R1 只删除原始 SAM 和未排序 BAM echo "[R1] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R1_SAM}" "${R1_BAM}" ############################################################################# # Round 2:minimap2 asm20,对 R1 未比对 reads ############################################################################# local R2_PREFIX="${BASE}-r2-asm20" local R2_SAM="${R2_PREFIX}.sam" local R2_BAM="${R2_PREFIX}.bam" local R2_SORTED_BAM="${R2_PREFIX}-sorted.bam" local R2_UNMAPPED_BAM="${R2_PREFIX}-sorted-unmapped.bam" local R2_UNMAPPED_IDS="${R2_PREFIX}-sorted.unmapped_read_ids.txt" local R2_UNMAPPED_IDS_CLEAN="${R2_PREFIX}-sorted.unmapped_read_ids.clean.txt" local R2_UNMAPPED_FASTQ="${R2_PREFIX}.unmapped.fq.gz" echo "[R2] minimap2 asm20: ${R1_UNMAPPED_FASTQ}" "${MINIMAP2}" -x asm20 -a --MD --cs=long \ -t "${MM2_THREADS}" \ "${REF}" "${R1_UNMAPPED_FASTQ}" > "${R2_SAM}" echo "[R2] SAM -> BAM & sort" "${SAMTOOLS}" view -@ "${ST_THREADS}" -b -o "${R2_BAM}" "${R2_SAM}" "${SAMTOOLS}" sort -@ "${ST_THREADS}" -o "${R2_SORTED_BAM}" "${R2_BAM}" "${SAMTOOLS}" index -@ "${ST_THREADS}" "${R2_SORTED_BAM}" echo "[R2] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ "${ST_THREADS}" -b -f 4 "${R2_SORTED_BAM}" -o "${R2_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ "${ST_THREADS}" "${R2_UNMAPPED_BAM}" | awk '{print $1}' > "${R2_UNMAPPED_IDS}" tr -d '\r' < "${R2_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R2_UNMAPPED_IDS_CLEAN}" echo "[R2] 未比对 ID 行数:$(wc -l < "${R2_UNMAPPED_IDS_CLEAN}")" echo "[R2] 使用 seqkit 从 R1_unmapped FASTQ 抽取 R2 未比对 reads" "${SEQKIT}" grep -j "${ST_THREADS}" -f "${R2_UNMAPPED_IDS_CLEAN}" "${R1_UNMAPPED_FASTQ}" -o "${R2_UNMAPPED_FASTQ}" echo "[R2] R2_unmapped FASTQ 统计:" "${SEQKIT}" stats "${R2_UNMAPPED_FASTQ}" echo "[R2] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R2_SAM}" "${R2_BAM}" ############################################################################# # Round 3:plan A(自定义 map-ont),对 R2 未比对 reads ############################################################################# local R3_PREFIX="${BASE}-r3-planA-mapont" local R3_SAM="${R3_PREFIX}.sam" local R3_BAM="${R3_PREFIX}.bam" local R3_SORTED_BAM="${R3_PREFIX}-sorted.bam" local R3_UNMAPPED_BAM="${R3_PREFIX}-sorted-unmapped.bam" local R3_UNMAPPED_IDS="${R3_PREFIX}-sorted.unmapped_read_ids.txt" local R3_UNMAPPED_IDS_CLEAN="${R3_PREFIX}-sorted.unmapped_read_ids.clean.txt" local R3_UNMAPPED_FASTQ="${R3_PREFIX}.unmapped.fq.gz" # 最终三轮都未比对上的 reads echo "[R3][planA] minimap2 map-ont with tuned params: ${R2_UNMAPPED_FASTQ}" "${MINIMAP2}" -x map-ont -a --MD --cs=long \ -k13 -w7 \ -r2k,60k -g8k \ -U40,700 \ -s70 -z600,200 \ -N30 -t "${MM2_THREADS}" \ "${REF}" "${R2_UNMAPPED_FASTQ}" > "${R3_SAM}" echo "[R3] SAM -> BAM & sort" "${SAMTOOLS}" view -@ "${ST_THREADS}" -b -o "${R3_BAM}" "${R3_SAM}" "${SAMTOOLS}" sort -@ "${ST_THREADS}" -o "${R3_SORTED_BAM}" "${R3_BAM}" "${SAMTOOLS}" index -@ "${ST_THREADS}" "${R3_SORTED_BAM}" echo "[R3] 提取未比对 reads 的 ID (flag 4)" "${SAMTOOLS}" view -@ "${ST_THREADS}" -b -f 4 "${R3_SORTED_BAM}" -o "${R3_UNMAPPED_BAM}" "${SAMTOOLS}" view -@ "${ST_THREADS}" "${R3_UNMAPPED_BAM}" | awk '{print $1}' > "${R3_UNMAPPED_IDS}" tr -d '\r' < "${R3_UNMAPPED_IDS}" | awk 'length($0)>0' | LC_ALL=C sort -u > "${R3_UNMAPPED_IDS_CLEAN}" echo "[R3] 未比对 ID 行数:$(wc -l < "${R3_UNMAPPED_IDS_CLEAN}")" echo "[R3] 使用 seqkit 从 R2_unmapped FASTQ 抽取三轮都未比对上的 reads" "${SEQKIT}" grep -j "${ST_THREADS}" -f "${R3_UNMAPPED_IDS_CLEAN}" "${R2_UNMAPPED_FASTQ}" -o "${R3_UNMAPPED_FASTQ}" echo "[R3] 最终未比对 FASTQ:${R3_UNMAPPED_FASTQ}" "${SEQKIT}" stats "${R3_UNMAPPED_FASTQ}" echo "[R3] 删除原始 SAM/BAM,保留 sorted.bam / bai / unmapped.bam" rm -f "${R3_SAM}" "${R3_BAM}" echo "=== 三轮比对完成(map-ont → asm20 → planA-mapont)===" echo "保留文件(每一轮):" echo " * ${R1_SORTED_BAM} 及其 .bai" echo " * ${R1_UNMAPPED_BAM}" echo " * ${R2_SORTED_BAM} 及其 .bai" echo " * ${R2_UNMAPPED_BAM}" echo " * ${R3_SORTED_BAM} 及其 .bai" echo " * ${R3_UNMAPPED_BAM}" echo "===================================================================" } ############################################################################### # 1)原始大 FASTQ(三轮比对) ############################################################################### RAW_FASTQ="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/fastq/Col-cotyledon-dor-pass-fail-sup.fastq" run_three_rounds "${RAW_FASTQ}" "Col-coty-dorado" ############################################################################### # 2)Chr1 / Chr2 / Chr3 的 SV FASTQ,也各自三轮比对 ############################################################################### SV_IN_DIR="/DataZhai/wangzh/2024_11_ATXR5/fastq/long/2024-11-dorado-ultra/2024-11-25-Zhai/Col-0/minimap2-Col-CC-V2/SV-test-fastq" for chr in Chr1 Chr2 Chr3; do IN_FASTQ="${SV_IN_DIR}/Col-coty-dorado-minimap2-Col-CC-V2-${chr}-SV.fastq" # 给每个子集一个不同的 tag,方便区分 SAMPLE_TAG="Col-coty-dorado-${chr}-SV" run_three_rounds "${IN_FASTQ}" "${SAMPLE_TAG}" done echo "=== 全部样本(三轮比对)已完成:原始 FASTQ + Chr1/Chr2/Chr3 SV ==="

怎么用 / 怎么扩展:

  • 如果以后你有新的 FASTQ,比如 new-sample.fastq,只需要在脚本末尾再加一行:

    bash
    run_three_rounds "/path/to/new-sample.fastq" "NewSampleName"
  • 脚本会自动:

    1. 每个 FASTQ 做 3 轮 minimap2(map-ont → asm20 → planA-mapont)
    2. 每轮都生成 sorted.bam + .bai,并保留未比对 reads 的 BAM 和 FASTQ
    3. 中间的 .sam 和未排序 .bam 会自动删掉,节省空间

如果你想把 某一类(比如只 Chr1/Chr2/Chr3) 单独拿出来跑,也可以删掉/注释掉不需要的 run_three_rounds 调用即可。

Share this Q&A