导入知识
导入知识接口调用流程说明:
- 导入文件类型的知识:需要先创建知识导入任务,然后上传本地文件,上传完成之后再启动知识导入任务。三个步骤的说明文档分别为:创建知识导入任务、上传文件、启动知识导入任务
- 导入URL类型的知识:只需创建知识导入任务
导入本地知识shell脚本示例
1)创建知识导入任务
请求创建知识导入任务,需要传入 dataset_id 和 data_source_type=1,详细参数说明参考「创建知识导入任务」
# Create a knowledge import job for local files (data_source_type=1).
# Every option line except the last must end with " \" so the shell treats
# the whole request as a single curl command.
curl 'https://aidmp.cn-sh-01.sensecoreapi.cn/studio/rag/data/v1/datasets/rag_f269a87d61de42508a4c8d3ed0095e56/jobs' \
  -H 'authorization: Bearer eyJhbGciOiJSU****' \
  --data-raw '{
"dataset_id":"rag_f269a87d61de42508a4c8d3ed0095e56",
"data_source_type":1
}'
响应示例
{
"name": "",
"job_id": "b1d6104abf6b46288fd66439dd6cdbab",
"data_source_type": 1,
"job_state": 1,
"creator": "jcao07",
"oss_temp_path": "",
"err_msg": "",
"job_info": {
"total_document_size": "0",
"total_document_count": "0",
"succeed_document_size": "0",
"succeed_document_count": "0",
"succeed_token_count": "0",
"failed_document_size": "0",
"failed_document_count": "0"
},
"document_info": [],
"create_time": null,
"finish_time": null,
"target_path": "",
"urls": [],
"document_pid": "",
"notion_page_ids": [],
"segment_strategy": null
}
2)上传文件
上传文件:根据创建知识导入任务时获取到的 job_id,获取上传文件的预签名 URL,并将本地文件上传,详细说明参考「上传文件」
# Request presigned upload URLs for the files of an import job.
# A space is required before the trailing backslash; without it the next
# line is glued onto the header value by the line continuation.
curl 'https://aidmp.cn-sh-01.sensecoreapi.cn/studio/rag/data/v1/jobs/b1d6104abf6b46288fd66439dd6cdbab/files:batchPresign' \
  -H 'authorization: Bearer eyJhbGciOiJSUz***' \
  --data-raw '{"job_id":"b1d6104abf6b46288fd66439dd6cdbab","rel_path":["base/api/README.md", "test.sh"]}'
其中 rel_path 是文件上传到知识库中的目标路径
响应示例
{
"result": {
"base/api/README.md": "https://aoss.cn-sh-01.sensecoreapi-oss.cn/rag-system/kn/datasets/rag_f269a87d61de42508a4c8d3ed0095e56/jobs/b1d6104abf6b46288fd66439dd6cdbab/base/api/README.md?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=7A7C633FEA733228360F10AEC8B9FBF3%2F20250102%2Fdefault%2Fs3%2Faws4_request&X-Amz-Date=20250102T094849Z&X-Amz-Expires=7200&X-Amz-SignedHeaders=host&X-Amz-Signature=917067634b90308340bf109faf8faff6c13f889f5505a4a938f4f7c77a329a8c",
"test.sh": "https://aoss.cn-sh-01.sensecoreapi-oss.cn/rag-system/kn/datasets/rag_f269a87d61de42508a4c8d3ed0095e56/jobs/b1d6104abf6b46288fd66439dd6cdbab/test.sh?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=7A7C633FEA733228360F10AEC8B9FBF3%2F20250102%2Fdefault%2Fs3%2Faws4_request&X-Amz-Date=20250102T094849Z&X-Amz-Expires=7200&X-Amz-SignedHeaders=host&X-Amz-Signature=8540f54a85f224614c392fc901c121db23ca4bef49b7ca11173da803309f2814"
}
}
根据获取的预签名 URL 依次上传本地文件,其中 -T 后面填写待上传文件的本地路径。
curl -X PUT -T local/README.md \
"https://aoss.cn-sh-01.sensecoreapi-oss.cn/rag-system/kn/datasets/rag_f269a87d61de42508a4c8d3ed0095e56/jobs/b1d6104abf6b46288fd66439dd6cdbab/base/api/README.md?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=7A7C633FEA733228360F10AEC8B9FBF3%2F20250102%2Fdefault%2Fs3%2Faws4_request&X-Amz-Date=20250102T094849Z&X-Amz-Expires=7200&X-Amz-SignedHeaders=host&X-Amz-Signature=917067634b90308340bf109faf8faff6c13f889f5505a4a938f4f7c77a329a8c"
curl -X PUT -T test.sh \
"https://aoss.cn-sh-01.sensecoreapi-oss.cn/rag-system/kn/datasets/rag_f269a87d61de42508a4c8d3ed0095e56/jobs/b1d6104abf6b46288fd66439dd6cdbab/test.sh?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=7A7C633FEA733228360F10AEC8B9FBF3%2F20250102%2Fdefault%2Fs3%2Faws4_request&X-Amz-Date=20250102T094849Z&X-Amz-Expires=7200&X-Amz-SignedHeaders=host&X-Amz-Signature=8540f54a85f224614c392fc901c121db23ca4bef49b7ca11173da803309f2814"
3)启动知识导入任务
上传完文件后调用启动知识导入任务接口,failed_file_count、failed_file_size、documents 分别表示上传失败的文件数量、大小和列表信息,详细说明参考「启动知识导入任务」
# Start the import job once all files have been uploaded.
# The job_id in the payload is the same job as in the URL (the one returned
# by the create-job call).
curl 'https://aidmp.cn-sh-01.sensecoreapi.cn/studio/rag/data/v1/datasets/rag_f269a87d61de42508a4c8d3ed0095e56/jobs/b1d6104abf6b46288fd66439dd6cdbab:start' \
  -H 'authorization: Bearer eyJhbGciOiJSUz***' \
  --data-raw '{
"dataset_id":"rag_f269a87d61de42508a4c8d3ed0095e56",
"job_id":"b1d6104abf6b46288fd66439dd6cdbab",
"failed_file_count":0,
"failed_file_size":0,
"documents":[]
}'
导入本地知识python脚本示例
以下代码是一个完整的 Python 脚本示例,用于将本地知识导入指定的知识库。该脚本通过认证生成 HTTP 请求头、创建导入任务、获取预签名 URL、上传文件并启动导入任务,完成整个知识导入流程。
# -*- coding: utf-8 -*-
import hmac
import hashlib
import base64
import requests
from datetime import datetime
import pytz
def gen_auth_header(access_key, secret_key):
    """
    Build the HTTP authentication headers (``X-Date`` and ``Authorization``).

    The string ``"x-date: <date>"`` is signed with HMAC-SHA256 keyed by
    ``secret_key``; the Base64-encoded digest is placed in the
    ``Authorization`` header together with ``access_key``.

    Args:
        access_key: Value written to the ``accesskey`` field.
        secret_key: Key used for the HMAC-SHA256 signature.

    Returns:
        dict: ``{"X-Date": <date string>, "Authorization": <auth string>}``.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from email.utils import formatdate

    # formatdate(usegmt=True) yields "Thu, 02 Jan 2025 09:48:49 GMT" with
    # English day/month names regardless of the process locale, whereas
    # strftime("%a ... %b") follows the locale and could produce a date that
    # is not a valid HTTP date.
    x_date = formatdate(usegmt=True)

    # Only the x-date header line is covered by the signature.
    sign_content = "x-date: {}".format(x_date)
    digest = hmac.new(
        secret_key.encode(), sign_content.encode(), hashlib.sha256
    ).digest()
    signature = base64.b64encode(digest).decode()

    auth = (
        'hmac accesskey="{access_key}", algorithm="hmac-sha256", '
        'headers="x-date", signature="{signature}"'.format(
            access_key=access_key, signature=signature
        )
    )
    return {"X-Date": x_date, "Authorization": auth}
def create_job(base_url, dataset_id, headers, timeout=30):
    """
    Create a knowledge import job for locally uploaded files.

    Sends a POST to ``{base_url}/datasets/{dataset_id}/jobs`` with
    ``data_source_type`` fixed to 1.

    Args:
        base_url: API base URL without a trailing slash.
        dataset_id: ID of the target dataset (knowledge base).
        headers: HTTP headers to send with the request.
        timeout: Timeout passed to ``requests``, in seconds. Without one, a
            stalled connection would block the script indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = "{}/datasets/{}/jobs".format(base_url, dataset_id)
    payload = {"dataset_id": dataset_id, "data_source_type": 1}
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()
def get_presigned_urls(base_url, job_id, headers, rel_paths, timeout=30):
    """
    Fetch presigned upload URLs for the files of an import job.

    Sends a POST to ``{base_url}/jobs/{job_id}/files:batchPresign``.

    Args:
        base_url: API base URL without a trailing slash.
        job_id: ID of the import job.
        headers: HTTP headers to send with the request.
        rel_paths: List of target paths, sent as ``rel_path``.
        timeout: Timeout passed to ``requests``, in seconds. Without one, a
            stalled connection would block the script indefinitely.

    Returns:
        The ``"result"`` member of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        KeyError: If the response body has no ``"result"`` member.
    """
    url = "{}/jobs/{}/files:batchPresign".format(base_url, job_id)
    payload = {"job_id": job_id, "rel_path": rel_paths}
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()["result"]
def upload_file(file_path, presigned_url, timeout=300):
    """
    Upload a local file to a presigned URL with HTTP PUT.

    Args:
        file_path: Path of the local file to upload.
        presigned_url: Presigned URL that receives the file content.
        timeout: Timeout passed to ``requests``, in seconds. Without one, a
            stalled connection would block the script indefinitely.

    Raises:
        OSError: If the local file cannot be opened.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Passing the open file object lets requests read the body from the file
    # handle instead of the script loading the content into a string first.
    with open(file_path, "rb") as f:
        response = requests.put(presigned_url, data=f, timeout=timeout)
    response.raise_for_status()
def start_job(base_url, dataset_id, job_id, headers, timeout=30):
    """
    Start a knowledge import job after its files have been uploaded.

    Sends a POST to ``{base_url}/datasets/{dataset_id}/jobs/{job_id}:start``.
    The payload always reports zero failed uploads (``failed_file_count`` and
    ``failed_file_size`` are 0, ``documents`` is empty).

    Args:
        base_url: API base URL without a trailing slash.
        dataset_id: ID of the target dataset (knowledge base).
        job_id: ID of the import job to start.
        headers: HTTP headers to send with the request.
        timeout: Timeout passed to ``requests``, in seconds. Without one, a
            stalled connection would block the script indefinitely.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = "{}/datasets/{}/jobs/{}:start".format(base_url, dataset_id, job_id)
    payload = {
        "dataset_id": dataset_id,
        "job_id": job_id,
        "failed_file_count": 0,
        "failed_file_size": 0,
        "documents": [],
    }
    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()
def main():
    """
    Run the full local-file import flow against one dataset.

    Steps: build auth headers, create the import job, fetch presigned URLs,
    upload every local file, then start the job. Any HTTP error raised by the
    helper functions propagates and aborts the script.
    """
    # Configuration
    access_key = "your_access_key"
    secret_key = "your_secret_key"
    base_url = "https://aidmp.cn-sh-01.sensecoreapi.cn/studio/rag/data/v1"
    dataset_id = "rag_f269a87d61de42508a4c8d3ed0095e56"
    # local_file_paths[i] is uploaded to rel_paths[i]; keep both lists aligned.
    local_file_paths = ["base/app/test_txt.txt", "test.sh"]
    rel_paths = ["base/app/test_txt.txt", "test.sh"]

    # Build the authentication headers
    headers = gen_auth_header(access_key, secret_key)

    # 1. Create the knowledge import job
    print("创建知识导入任务...")
    job_response = create_job(base_url, dataset_id, headers)
    job_id = job_response["job_id"]
    print("任务已创建,Job ID: {}".format(job_id))

    # 2. Fetch the presigned URLs
    print("获取预签名 URL...")
    presigned_urls = get_presigned_urls(base_url, job_id, headers, rel_paths)
    print("预签名 URL 已获取。")

    # 3. Upload the files
    print("上传文件...")
    for local_file_path, rel_path in zip(local_file_paths, rel_paths):
        print("上传文件 {} 到 {}...".format(local_file_path, rel_path))
        upload_file(local_file_path, presigned_urls[rel_path])
    print("所有文件已上传。")

    # 4. Start the knowledge import job
    print("启动知识导入任务...")
    # Uploads may take a while, so sign again to send a current X-Date.
    # NOTE(review): assumes the server may reject a stale X-Date -- confirm.
    headers = gen_auth_header(access_key, secret_key)
    start_response = start_job(base_url, dataset_id, job_id, headers)
    print("任务已启动,响应: {}".format(start_response))


if __name__ == "__main__":
    main()
导入URL知识shell脚本示例
1)创建知识导入任务
请求创建知识导入任务,需要传入 dataset_id 和 data_source_type=2,详细参数说明参考「创建知识导入任务」
curl 'https://aidmp.cn-sh-01.sensecoreapi.cn/studio/rag/data/v1/datasets/rag_6a53d6ce2ae74633b2e52361286c53ad/jobs' \
-H 'authorization: Bearer eyJhbGciOiJSUzI1****' \
--data-raw '{
"dataset_id":"rag_6a53d6ce2ae74633b2e52361286c53ad",
"data_source_type":2,
"urls":["https://www.baidu.com/","https://www.nowcoder.com/"]
}'
创建之后的任务进度详情可参考获取知识导入任务详情
响应示例
{
"name": "",
"job_id": "a935710c78ef4857b58b2d1777e665f1",
"data_source_type": 2,
"job_state": 1,
"creator": "aidmpywj",
"oss_temp_path": "",
"err_msg": "",
"job_info": {
"total_document_size": "0",
"total_document_count": "2",
"succeed_document_size": "0",
"succeed_document_count": "0",
"succeed_token_count": "0",
"failed_document_size": "0",
"failed_document_count": "0"
},
"document_info": [],
"create_time": null,
"finish_time": null,
"target_path": "",
"urls": [],
"document_pid": "",
"notion_page_ids": [],
"segment_strategy": null
}