#include <cstdio> #include <cstdlib> #include <cst...

Created: March 26, 2025

Model used for the answer: O1 by Chat01

Question

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cctype>
#include <string>
#include <vector>
#include <fstream>
#include <algorithm>
#include <cmath>
#include <random>
#include <limits>
#include <android/log.h>
#include "inc/QnnWrapper.h"
#include "inc/FileSystemOP.hpp"
#include "inc/Logger.h"
#include "GetOpt.hpp"

#ifdef SUPPORT_TOKENIZERS
#include "third_party/tokenizers-cpp/include/tokenizers_c.h"
static TokenizerHandle g_tokenizer = nullptr;
#endif

static struct AppConfig {
std::string backendPath;
std::string systemLibraryPath;
std::string cachedBinaryPath;
std::string tokenizerJsonPath;
std::string inputText;
} g_appConfig;

static QnnRuntimeEnv* g_qnnEnv = nullptr;
static QnnSession* g_qnnSession = nullptr;
static SessionInfo* g_sessionInfo = nullptr;
static ZeekrGraphInfo* g_zeekrGraph = nullptr;

static const int g_eosTokenId = 2;
static const float g_temperature = 0.7f;
static const int g_topK = 16;
static const int g_maxNewTokens = 32;
static const float g_repetitionPenalty = 1.0f;
static const int g_modelMaxInputLen = 32;

static std::string trim(const std::string& str) {
auto start = str.find_first_not_of(" \t\n\r");
auto end = str.find_last_not_of(" \t\n\r");
return (start == std::string::npos) ? "" : str.substr(start, end - start + 1);
}

std::vector<std::string> splitString(const std::string& str, const std::string& sep) {
std::vector<std::string> result;
size_t start = 0, end;
while ((end = str.find(sep, start)) != std::string::npos) {
result.push_back(trim(str.substr(start, end - start)));
start = end + sep.length();
}
result.push_back(trim(str.substr(start)));
return result;
}

static double getCurrentRSS() {
FILE* file = fopen("/proc/self/status", "r");
if (!file) return 0.0;

char line[128];
double rss_mb = 0.0;
while (fgets(line, sizeof(line), file)) {
    if (strncmp(line, "VmRSS:", 6) == 0) {
        char* num_start = line + 6;
        while (*num_start && !isdigit(*num_start)) num_start++;
        rss_mb = atof(num_start) / 1024.0;
        break;
    }
}
fclose(file);
return rss_mb;

}

static std::string decodeTokens(const std::vector<int>& tokenIds) {
#ifndef SUPPORT_TOKENIZERS
return "";
#else
if (!g_tokenizer) {
__android_log_print(ANDROID_LOG_ERROR, "Tokenizer", "Tokenizer not initialized");
return "";
}
if (tokenIds.empty()) return "";

std::vector<uint32_t> data32(tokenIds.begin(), tokenIds.end());
tokenizers_decode(g_tokenizer, data32.data(), data32.size(), 0);
const char* decodedPtr = nullptr;
size_t decodedLen = 0;
tokenizers_get_decode_str(g_tokenizer, &decodedPtr, &decodedLen);
if (!decodedPtr || decodedLen == 0) {
    return "";
}
return std::string(decodedPtr, decodedLen);

#endif
}

static std::vector<float> runInferenceGetLogits(const std::vector<int32_t>& tokens) {
double mem_before = getCurrentRSS();
if (!g_zeekrGraph || g_zeekrGraph->numInputTensors == 0) {
__android_log_print(ANDROID_LOG_ERROR, "QNN", "Model not initialized");
return {};
}

ZeekrTensor& inTensor = g_zeekrGraph->input_TensorInfo[0];
size_t copyCount = std::min(tokens.size(), (size_t)g_modelMaxInputLen);
memcpy(inTensor.tensor_data, tokens.data(), copyCount * sizeof(int32_t));
if (copyCount < g_modelMaxInputLen) {
    memset((int32_t*)inTensor.tensor_data + copyCount, 0,
           (g_modelMaxInputLen - copyCount) * sizeof(int32_t));
}
session_Run(g_qnnSession, g_zeekrGraph);

double mem_after = getCurrentRSS();
__android_log_print(ANDROID_LOG_INFO, "Memory", "Inference: %.2f MB", mem_after - mem_before);

std::vector<float> logits;
if (g_zeekrGraph->numOutputTensors > 0) {
    ZeekrTensor& outTensor = g_zeekrGraph->output_TensorInfo[0];
    float* output = reinterpret_cast<float*>(outTensor.tensor_data);
    size_t floatCount = outTensor.data_size / sizeof(float);
    logits.resize(floatCount);
    for (size_t i = 0; i < floatCount; ++i) {
        logits[i] = output[i];
    }
}
return logits;

}

static int sampleNextToken(const std::vector<float>& logits,
const std::vector<int>& currentSequence,
float temperature,
float repetitionPenalty,
int topK) {
if (logits.empty()) {
return 0;
}

std::vector<float> modified = logits;
if (std::fabs(temperature) < 1e-9) {
    int maxIdx = -1;
    float maxVal = -1e30f;
    for (int i = 0; i < (int)modified.size(); i++) {
        if (modified[i] > maxVal) {
            maxVal = modified[i];
            maxIdx = i;
        }
    }
    return (maxIdx < 0) ? 0 : maxIdx;
} else {
    for (auto& val : modified) {
        val /= temperature;
    }
}

if (topK > 0 && topK < (int)modified.size()) {
    std::vector<float> tempCopy = modified;
    std::nth_element(tempCopy.begin(), tempCopy.begin() + topK, tempCopy.end(),
                     std::greater<float>());
    float kthVal = tempCopy[topK - 1];
    for (auto& v : modified) {
        if (v < kthVal) {
            v = -std::numeric_limits<float>::infinity();
        }
    }
}

std::vector<double> probs(modified.size());
double sumExp = 0.0;
for (size_t i = 0; i < modified.size(); i++) {
    double e = std::exp((double)modified[i]);
    probs[i] = e;
    sumExp += e;
}
if (sumExp < 1e-10) {
    return 0; // all -inf
}
for (auto& p : probs) {
    p /= sumExp;
}

static thread_local std::mt19937 rng(std::random_device{}());
std::discrete_distribution<int> dist(probs.begin(), probs.end());
return dist(rng);

}

static std::vector<int> generateSequence(std::vector<int> initTokens) {
std::vector<int> result = initTokens;

while (result.size() < g_maxNewTokens) {
    std::vector<int32_t> input32;
    size_t currentLen = result.size();
    if (currentLen > g_modelMaxInputLen) {
        input32.assign(result.end() - g_modelMaxInputLen, result.end());
        currentLen = g_modelMaxInputLen;
    } else {
        input32 = std::vector<int32_t>(result.begin(), result.end());
    }
    input32.resize(g_modelMaxInputLen, 0);

    std::vector<float> logits = runInferenceGetLogits(input32);
    if (logits.empty()) break;

    int nextTokenId = sampleNextToken(logits, result, g_temperature,
                                      g_repetitionPenalty, g_topK);
    if (nextTokenId == g_eosTokenId) break;
    result.push_back(nextTokenId);
}
return result;

}

bool initTokenizer() {
double mem_before = getCurrentRSS();
#ifdef SUPPORT_TOKENIZERS
if (g_appConfig.tokenizerJsonPath.empty()) {
__android_log_print(ANDROID_LOG_ERROR, "Tokenizer", "Missing tokenizer.json path");
return false;
}

std::ifstream file(g_appConfig.tokenizerJsonPath);
if (!file) {
    __android_log_print(ANDROID_LOG_ERROR, "Tokenizer", "Cannot open tokenizer.json");
    return false;
}
std::string jsonBlob((std::istreambuf_iterator<char>(file)),
                     std::istreambuf_iterator<char>());
g_tokenizer = tokenizers_new_from_str(jsonBlob.c_str(), jsonBlob.size());

double mem_after = getCurrentRSS();
__android_log_print(ANDROID_LOG_INFO, "Memory", "Tokenizer init: %.2f MB", mem_after - mem_before);
return g_tokenizer != nullptr;

#else
double mem_after = getCurrentRSS();
__android_log_print(ANDROID_LOG_INFO, "Memory", "Tokenizer init: %.2f MB", 0.0);
__android_log_print(ANDROID_LOG_WARN, "Tokenizer", "Tokenizers not supported");
return false;
#endif
}

std::string tokenizeText(const std::string& text) {
#ifdef SUPPORT_TOKENIZERS
if (!g_tokenizer) {
__android_log_print(ANDROID_LOG_ERROR, "Tokenizer", "Tokenizer not initialized");
return "";
}

TokenizerEncodeResult result;
tokenizers_encode(g_tokenizer, text.c_str(), text.length(), 0, &result);

std::string tokenStr;
for (size_t i = 0; i < result.len; ++i) {
    tokenStr += std::to_string(result.token_ids[i]);
    if (i != result.len - 1) tokenStr += " ";
}

// Debug: print the raw token IDs (before freeing the encode result).
printf("[DEBUG] Tokenized IDs: ");
for (size_t i = 0; i < result.len; ++i) {
    printf("%d ", result.token_ids[i]);
}
printf("\n");

tokenizers_free_encode_results(&result, 1);
return tokenStr;

#else
return "Tokenizers not supported";
#endif
}

bool initQnnModel() {
double mem_before = getCurrentRSS();
std::string new_path = "/system/app/pp_test/lxg/lib/aarch64-android/";
const char* current_path = getenv("LD_LIBRARY_PATH");
if (current_path) new_path += ":" + std::string(current_path);
setenv("LD_LIBRARY_PATH", new_path.c_str(), 1);

g_qnnEnv = createQnnRuntimeEnv(g_appConfig.backendPath.c_str(),
                               g_appConfig.systemLibraryPath.c_str());
if (!g_qnnEnv) {
    __android_log_print(ANDROID_LOG_ERROR, "QNN", "createQnnRuntimeEnv failed");
    return false;
}
g_qnnSession = createQnnRuntimeSession(g_appConfig.cachedBinaryPath.c_str(), g_qnnEnv);
if (!g_qnnSession) {
    __android_log_print(ANDROID_LOG_ERROR, "QNN", "createQnnRuntimeSession failed");
    return false;
}
g_sessionInfo = getSessionInfo(g_qnnSession);
if (!g_sessionInfo) {
    __android_log_print(ANDROID_LOG_ERROR, "QNN", "getSessionInfo failed");
    return false;
}
g_zeekrGraph = &g_sessionInfo->graphInfo[0];
print_sessionInfo(g_sessionInfo);

double mem_after = getCurrentRSS();
__android_log_print(ANDROID_LOG_INFO, "Memory", "QNN init: %.2f MB", mem_after - mem_before);
return true;

}

void runInference(const std::vector<int32_t>& tokens) {
if (!g_zeekrGraph || g_zeekrGraph->numInputTensors == 0) {
__android_log_print(ANDROID_LOG_ERROR, "QNN", "Model not initialized");
return;
}

ZeekrTensor& inTensor = g_zeekrGraph->input_TensorInfo[0];
size_t copyCount = std::min(tokens.size(), (size_t)g_modelMaxInputLen);
memcpy(inTensor.tensor_data, tokens.data(), copyCount * sizeof(int32_t));
if (copyCount < g_modelMaxInputLen) {
    memset((int32_t*)inTensor.tensor_data + copyCount, 0,
           (g_modelMaxInputLen - copyCount) * sizeof(int32_t));
}
session_Run(g_qnnSession, g_zeekrGraph);

if (g_zeekrGraph->numOutputTensors > 0) {
    ZeekrTensor& outTensor = g_zeekrGraph->output_TensorInfo[0];
    float* output = reinterpret_cast<float*>(outTensor.tensor_data);
    printf("Inference Result: ");
    for (size_t i = 0; i < outTensor.data_size / sizeof(float); ++i) {
        printf("%.4f ", output[i]);
    }
    printf("\n");
}

}

void showHelp() {
printf("Usage: ./qnn_sample [OPTIONS]\n");
printf("Options:\n");
printf(" --tokenize <TEXT> Tokenize input text\n");
printf(" --init Initialize model and tokenizer\n");
printf(" --inference <TEXT> Run inference with text\n");
printf(" --generate <TEXT> Run generation with text (top_k/temperature sampling)\n"); // [ADDED]
printf("Required for initialization:\n");
printf(" --backend <PATH> QNN backend library path\n");
printf(" --systemlib <PATH> QNN system library path\n");
printf(" --cached <PATH> Cached model binary path\n");
printf(" --tokenizer <PATH> Tokenizer JSON file path\n");
}

void parseArgs(int argc, char** argv) {
static const pal::Option long_options[] = {
{"tokenize", pal::required_argument, 0, 't'},
{"init", pal::no_argument, 0, 'i'},
{"inference", pal::required_argument, 0, 'f'},
{"generate", pal::required_argument, 0, 'g'}, // [ADDED]
{"backend", pal::required_argument, 0, 'b'},
{"systemlib", pal::required_argument, 0, 's'},
{"cached", pal::required_argument, 0, 'c'},
{"tokenizer", pal::required_argument, 0, 'j'},
{"help", pal::no_argument, 0, 'h'},
{0, 0, 0, 0}
};

int opt, option_index = 0;
while ((opt = pal::getOptLongOnly(argc, (const char**)argv, "t:if:g:b:s:c:j:h",
                                  long_options, &option_index)) != -1) {
    switch (opt) {
        case 't': g_appConfig.inputText = pal::g_optArg; break;
        case 'i': break;
        case 'f': g_appConfig.inputText = pal::g_optArg; break;
        case 'g':
            if (pal::g_optArg) {
                g_appConfig.inputText = pal::g_optArg;
            }
            break;
        case 'b': g_appConfig.backendPath = pal::g_optArg; break;
        case 's': g_appConfig.systemLibraryPath = pal::g_optArg; break;
        case 'c': g_appConfig.cachedBinaryPath = pal::g_optArg; break;
        case 'j': g_appConfig.tokenizerJsonPath = pal::g_optArg; break;
        case 'h': showHelp(); exit(0);
        default: showHelp(); exit(1);
    }
}

}

int main(int argc, char** argv) {
parseArgs(argc, argv);

if (argc >= 2 && strcmp(argv[1], "--tokenize") == 0) {
    if (!g_appConfig.inputText.empty()) {
        if (initTokenizer()) {
            std::string tokens = tokenizeText(g_appConfig.inputText);
            printf("Tokenize Result:\nInput: \"%s\"\nTokens: \"%s\"\n",
                   g_appConfig.inputText.c_str(), tokens.c_str());
        }
    }
    return 0;
}

if (argc >= 2 && strcmp(argv[1], "--init") == 0) {
    bool tokenizerOK = initTokenizer();
    bool qnnOK = initQnnModel();
    printf("Initialization Status:\nTokenizer: %s\nQNN Model: %s\n",
           tokenizerOK ? "Success" : "Failed",
           qnnOK ? "Success" : "Failed");
    return 0;
}

if (argc >= 2 && strcmp(argv[1], "--inference") == 0) {
    std::ifstream fin("input.txt");
    if (!fin.is_open()) {
        printf("Cannot open input.txt");
        return 1;
    }
    std::string line;
    std::getline(fin, line);
    g_appConfig.inputText = line;
    fin.close();
    if (g_appConfig.inputText.empty()) {
        printf("Input from input.txt is empty");
        return 1;
    }
    if (!initTokenizer() || !initQnnModel()) {
        printf("Initialization failed, cannot run inference");
        return 1;
    }
    std::string tokenStr = tokenizeText(g_appConfig.inputText);
    std::vector<std::string> tokenList = splitString(tokenStr, " ");
    std::vector<int32_t> tokens;
    for (const auto& t : tokenList) {
        tokens.push_back(std::stoll(t));
    }
    runInference(tokens);
    return 0;
}

if (argc >= 2 && strcmp(argv[1], "--generate") == 0) {
    std::ifstream fin("input.txt");
    if (!fin.is_open()) {
        printf("Cannot open input.txt\n");
        return 1;
    }
    std::string line;
    std::getline(fin, line);
    g_appConfig.inputText = line;
    fin.close();
    if (g_appConfig.inputText.empty()) {
        printf("Input from input.txt is empty\n");
        return 1;
    }
    if (!initTokenizer() || !initQnnModel()) {
        printf("Initialization failed, cannot run generation\n");
        return 1;
    }
    std::string tokenStr = tokenizeText(g_appConfig.inputText);
    std::vector<std::string> tokenList = splitString(tokenStr, " ");
    std::vector<int> initTokens;
    for (const auto& t : tokenList) {
        initTokens.push_back(std::stoi(t));
    }
    std::vector<int> generatedIds = generateSequence(initTokens);
    std::string finalText = decodeTokens(generatedIds);
    printf("Generated text: %s\n", finalText.c_str());
    return 0;
}

showHelp();
return 1;

}
After compiling and running the code above, I feed in `<s> user\n2D上北下南显示</s> \n<s> assistant\n`, and the generated output comes back as `<s> user\n2D上北下南显示</s> \n<s> assistant\n]]ro`. Why does this happen? In my tokenizer.json, `<s>` is the begin-of-sequence token with ID 1, and `</s>` is the end-of-sequence token with ID 2.

Thinking

Incorporating the code elements reveals a concern with syntax and clarity.

Taking a closer look at header inclusions reveals a thorough understanding of essential libraries for low-level and high-level programming tasks.

Considering the inclusion of headers like cstdio, cstdlib, cstring, string, and vector to ensure comprehensive functionality for various programming needs.

Ensuring comprehensive coverage

I'm including essential headers like cstdio, cstdlib, cstring, string, and vector to ensure the program has the necessary functionality for various programming tasks.

Strategizing header inclusions

I’m pulling together essential headers like cstdio, cstdlib, cstring, string, and vector. This ensures a robust and flexible programming environment.


Reasoned for 6 seconds

Answer

Based on the code you provided and the behavior you describe, the problem most likely comes from how the tokenizer or the model parses and generates tokens. Specifically, the input text `<s> user\n2D上北下南显示</s> \n<s> assistant\n` is mishandled and comes back as `<s> user\n2D上北下南显示</s> \n<s> assistant\n]]ro`. Possible causes:

1. Tokenization errors

  • In tokenizeText, the input text is first run through the tokenizer. You mention that `<s>` is the begin token with ID 1 and `</s>` is the end token with ID 2. If the tokenizer initialization or the encoding step fails to parse the boundaries of these special markers correctly, the resulting token IDs will be wrong.
  • A likely problem is that, during tokenization, the markers `<s>` and `</s>` are not treated as single special tokens, so the split produces incorrect token IDs (see the sketch below).
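As a quick way to check this, here is a minimal sketch that reuses the tokenizers-cpp C calls already present in tokenizeText above (the helper name checkSpecialTokenEncoding and the probe strings are made up for illustration): encode the literal markers and print the IDs the tokenizer actually assigns, which should come out as 1 for `<s>` and 2 for `</s>` if tokenizer.json is set up as you describe.

```cpp
// Hypothetical debug helper (requires SUPPORT_TOKENIZERS and an initialized g_tokenizer).
// Encodes a few probe strings and prints the raw IDs so the special-token mapping can be verified.
static void checkSpecialTokenEncoding() {
    const char* probes[] = {"<s>", "</s>", "<s> user"};
    for (const char* probe : probes) {
        TokenizerEncodeResult result;
        // Same call and flags as tokenizeText(): add_special_tokens = 0.
        tokenizers_encode(g_tokenizer, probe, strlen(probe), 0, &result);
        printf("[CHECK] \"%s\" ->", probe);
        for (size_t i = 0; i < result.len; ++i) {
            printf(" %d", (int)result.token_ids[i]);  // expect 1 for <s>, 2 for </s>
        }
        printf("\n");
        tokenizers_free_encode_results(&result, 1);
    }
}
```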

2. Model decoding problems

  • When the model generates text, special tokens may be mishandled: tokens such as `<s>` or `</s>` can be mapped to other characters or tokens. This can happen if the model never properly learned the context of these special symbols, or if they were not defined correctly during training.

3. Token mismatches during generation

  • The generateSequence function generates new tokens from the given prompt tokens. If the tokens are not passed or processed correctly, or if the token IDs the model produces are not decoded correctly, unexpected characters or markers (such as ]]ro) can show up in the output. It helps to log the raw generated IDs before decoding them, as sketched below.
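For example, a small sketch of that logging, using only the calls already present in the --generate branch above: print the raw IDs before decodeTokens runs, so you can see exactly which IDs turn into the stray ]]ro.

```cpp
// Sketch: dump the raw generated IDs before decoding them.
std::vector<int> generatedIds = generateSequence(initTokens);
printf("[DEBUG] Generated IDs:");
for (int id : generatedIds) {
    printf(" %d", id);  // per your tokenizer.json, 1 / 2 here correspond to <s> / </s>
}
printf("\n");
std::string finalText = decodeTokens(generatedIds);
printf("Generated text: %s\n", finalText.c_str());
```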

Solutions

1. Check the tokenizer initialization

  • Make sure the tokenizer.json you provide contains all the required special tokens (such as `<s>` and `</s>`) with their correct IDs, and that the tokenizer initializes without errors. Try printing the tokenization result to confirm that the input text is processed correctly.

  • You can add debug output in tokenizeText to check whether the tokenizer parses `<s>` and `</s>` correctly:

```cpp
std::string tokenStr = tokenizeText(g_appConfig.inputText);
printf("[DEBUG] Tokenized IDs: %s\n", tokenStr.c_str());
```
  • If the tokenizer's segmentation rules are wrong, adjust the tokenizer configuration file or re-verify the tokenizer initialization, for example with a round-trip check like the one below.
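A minimal sketch of such a round-trip check, again reusing only the C API calls that already appear in the code above (variable names are illustrative): encode the prompt, decode the IDs straight back, and compare the result with the input. If the round trip already mangles the text, the problem is on the tokenizer side rather than in the model.

```cpp
// Sketch: encode-then-decode round trip to isolate tokenizer problems.
TokenizerEncodeResult result;
const std::string prompt = g_appConfig.inputText;
tokenizers_encode(g_tokenizer, prompt.c_str(), prompt.length(), 0, &result);

std::vector<uint32_t> ids(result.token_ids, result.token_ids + result.len);
tokenizers_decode(g_tokenizer, ids.data(), ids.size(), 0);

const char* decoded = nullptr;
size_t decodedLen = 0;
tokenizers_get_decode_str(g_tokenizer, &decoded, &decodedLen);
printf("[ROUNDTRIP] in : %s\n", prompt.c_str());
printf("[ROUNDTRIP] out: %.*s\n", (int)decodedLen, decoded ? decoded : "");
tokenizers_free_encode_results(&result, 1);
```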

2. Check the model output and decoding

  • You mention that `<s>` is the begin token and `</s>` is the end token. If the model unexpectedly emits invalid tokens (such as ]]ro) during generation, inspect the model's output layer and the decoding path: log the generated token IDs and the decoded text, and make sure every generated token is converted back to a string correctly throughout the generation loop.

```cpp
std::vector<int> generatedIds = generateSequence(initTokens);
std::string finalText = decodeTokens(generatedIds);
printf("Generated text: %s\n", finalText.c_str());
```
  • Check the implementation of decodeTokens and make sure token IDs are correctly converted back to the original text; a variant that skips the special tokens during decoding is sketched below.
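One variant worth trying is sketched below, under the assumption (to be verified against your tokenizers-cpp version) that the fourth argument of tokenizers_decode is a skip-special-tokens flag; the sketch also filters IDs 1 and 2 manually, so it does not rely on that flag alone. The helper name decodeTokensSkippingSpecials is made up for illustration.

```cpp
// Hypothetical variant of decodeTokens that drops the special tokens before decoding.
// Assumption: the 4th argument of tokenizers_decode is a skip-special-tokens flag.
static std::string decodeTokensSkippingSpecials(const std::vector<int>& tokenIds) {
    std::vector<uint32_t> data32;
    for (int id : tokenIds) {
        if (id == 1 || id == g_eosTokenId) continue;  // manually drop <s> / </s>
        data32.push_back((uint32_t)id);
    }
    if (data32.empty()) return "";
    tokenizers_decode(g_tokenizer, data32.data(), data32.size(), /*skip_special_tokens=*/1);
    const char* ptr = nullptr;
    size_t len = 0;
    tokenizers_get_decode_str(g_tokenizer, &ptr, &len);
    return (ptr && len) ? std::string(ptr, len) : "";
}
```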

3. Adjust the sampling strategy during generation

  • generateSequence uses temperature, repetition penalty (repetitionPenalty), and top-K sampling (topK). If these values are badly chosen, the model can produce implausible tokens. Try adjusting these parameters to avoid illogical results.

  • For example, raise the temperature or change topK and see whether the output improves. Temporarily comparing against greedy decoding (temperature 0) is often the quickest way to tell sampling noise from a real tokenizer or model problem, as in the sketch below.
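A minimal sketch of that comparison, meant to be dropped into the loop in generateSequence (input32 and result are the local variables there; nothing else is new): calling sampleNextToken a second time with temperature 0 yields the greedy arg-max token, so you can see whether the odd output comes from sampling noise or from the logits themselves.

```cpp
// Sketch: compare the sampled token against greedy arg-max for the same logits.
// sampleNextToken() above already falls back to arg-max when temperature is ~0.
std::vector<float> logits = runInferenceGetLogits(input32);
int sampled = sampleNextToken(logits, result, g_temperature, g_repetitionPenalty, g_topK);
int greedy  = sampleNextToken(logits, result, /*temperature=*/0.0f,
                              /*repetitionPenalty=*/1.0f, /*topK=*/0);
__android_log_print(ANDROID_LOG_INFO, "Sampling", "sampled=%d greedy=%d", sampled, greedy);
```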

Debugging suggestions

  1. Verify the tokenizer: print the tokenized IDs and the decoded text, and confirm that `<s>` and `</s>` are converted correctly.
  2. Print the logits: during generation, print the logits to inspect the model's output distribution and understand why unsuitable tokens are produced (a small logging sketch follows this list).
  3. Make sure the model configuration is correct: confirm that the model's input and output formats match the tokenizer, and that the model was actually trained to handle these special tokens.
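For step 2, here is a small sketch of such a logits dump (the helper name printTopLogits is made up; std::partial_sort over index/value pairs is just one straightforward way to do it):

```cpp
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical helper: print the k largest logits together with their token IDs.
static void printTopLogits(const std::vector<float>& logits, int k = 5) {
    std::vector<std::pair<float, int>> scored;
    scored.reserve(logits.size());
    for (int i = 0; i < (int)logits.size(); ++i) {
        scored.push_back({logits[i], i});
    }
    if (k > (int)scored.size()) k = (int)scored.size();
    std::partial_sort(scored.begin(), scored.begin() + k, scored.end(),
                      [](const std::pair<float, int>& a, const std::pair<float, int>& b) {
                          return a.first > b.first;
                      });
    printf("[DEBUG] top-%d logits:", k);
    for (int i = 0; i < k; ++i) {
        printf(" (id=%d, %.3f)", scored[i].second, scored[i].first);
    }
    printf("\n");
}
```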

Following these steps should let you pinpoint the root cause and make the appropriate fixes.
