diff --git a/.github/code_spell_ignore.txt b/.github/code_spell_ignore.txt index e69de29bb2..4566d4f3a2 100644 --- a/.github/code_spell_ignore.txt +++ b/.github/code_spell_ignore.txt @@ -0,0 +1,2 @@ +ModelIn +modelin diff --git a/.github/license_template.txt b/.github/license_template.txt index b43bb9dc80..a0410374d8 100644 --- a/.github/license_template.txt +++ b/.github/license_template.txt @@ -1,2 +1,2 @@ Copyright (C) 2024 Intel Corporation -SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +SPDX-License-Identifier: Apache-2.0 diff --git a/.github/workflows/_example-workflow.yml b/.github/workflows/_example-workflow.yml index 07e857d61b..9b50d93490 100644 --- a/.github/workflows/_example-workflow.yml +++ b/.github/workflows/_example-workflow.yml @@ -40,6 +40,11 @@ on: default: "main" required: false type: string + inject_commit: + default: false + required: false + type: string + jobs: #################################################################################################### # Image Build @@ -72,6 +77,10 @@ jobs: git clone https://github.com/vllm-project/vllm.git cd vllm && git rev-parse HEAD && cd ../ fi + if [[ $(grep -c "vllm-hpu:" ${docker_compose_path}) != 0 ]]; then + git clone https://github.com/HabanaAI/vllm-fork.git + cd vllm-fork && git rev-parse HEAD && cd ../ + fi git clone https://github.com/opea-project/GenAIComps.git cd GenAIComps && git checkout ${{ inputs.opea_branch }} && git rev-parse HEAD && cd ../ @@ -83,6 +92,7 @@ jobs: docker_compose_path: ${{ github.workspace }}/${{ inputs.example }}/docker_image_build/build.yaml service_list: ${{ inputs.services }} registry: ${OPEA_IMAGE_REPO}opea + inject_commit: ${{ inputs.inject_commit }} tag: ${{ inputs.tag }} #################################################################################################### diff --git a/.github/workflows/_manifest-e2e.yml b/.github/workflows/_manifest-e2e.yml index 69a080506d..fc414490da 100644 --- a/.github/workflows/_manifest-e2e.yml +++ b/.github/workflows/_manifest-e2e.yml @@ -90,10 +90,16 @@ jobs: echo "Validate ${{ inputs.example }} successful!" else echo "Validate ${{ inputs.example }} failure!!!" - .github/workflows/scripts/k8s-utils.sh dump_all_pod_logs $NAMESPACE + echo "Check the logs in 'Dump logs when e2e test failed' step!!!" + exit 1 fi fi + - name: Dump logs when e2e test failed + if: failure() + run: | + .github/workflows/scripts/k8s-utils.sh dump_all_pod_logs $NAMESPACE + - name: Kubectl uninstall if: always() run: | diff --git a/.github/workflows/_run-docker-compose.yml b/.github/workflows/_run-docker-compose.yml index fe86a60392..60bf70dcb8 100644 --- a/.github/workflows/_run-docker-compose.yml +++ b/.github/workflows/_run-docker-compose.yml @@ -141,7 +141,11 @@ jobs: flag=${flag#test_} yaml_file=$(find . -type f -wholename "*${{ inputs.hardware }}/${flag}.yaml") echo $yaml_file - docker compose -f $yaml_file stop && docker compose -f $yaml_file rm -f || true + container_list=$(cat $yaml_file | grep container_name | cut -d':' -f2) + for container_name in $container_list; do + cid=$(docker ps -aq --filter "name=$container_name") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + done docker system prune -f docker rmi $(docker images --filter reference="*:5000/*/*" -q) || true diff --git a/.github/workflows/check-online-doc-build.yml b/.github/workflows/check-online-doc-build.yml new file mode 100644 index 0000000000..4972f398dc --- /dev/null +++ b/.github/workflows/check-online-doc-build.yml @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Check Online Document Building +permissions: {} + +on: + pull_request: + branches: [main] + paths: + - "**.md" + - "**.rst" + +jobs: + build: + runs-on: ubuntu-latest + steps: + + - name: Checkout + uses: actions/checkout@v4 + with: + path: GenAIExamples + + - name: Checkout docs + uses: actions/checkout@v4 + with: + repository: opea-project/docs + path: docs + + - name: Build Online Document + shell: bash + run: | + echo "build online doc" + cd docs + bash scripts/build.sh diff --git a/.github/workflows/manual-example-workflow.yml b/.github/workflows/manual-example-workflow.yml index 03ba728c79..9e31f26d78 100644 --- a/.github/workflows/manual-example-workflow.yml +++ b/.github/workflows/manual-example-workflow.yml @@ -50,6 +50,11 @@ on: description: 'OPEA branch for image build' required: false type: string + inject_commit: + default: true + description: "inject commit to docker images true or false" + required: false + type: string permissions: read-all jobs: @@ -101,4 +106,5 @@ jobs: test_k8s: ${{ fromJSON(inputs.test_k8s) }} test_gmc: ${{ fromJSON(inputs.test_gmc) }} opea_branch: ${{ inputs.opea_branch }} + inject_commit: ${{ inputs.inject_commit }} secrets: inherit diff --git a/.github/workflows/manual-image-build.yml b/.github/workflows/manual-image-build.yml index 8a0b0cf2c5..53ba750ed6 100644 --- a/.github/workflows/manual-image-build.yml +++ b/.github/workflows/manual-image-build.yml @@ -30,6 +30,12 @@ on: description: 'OPEA branch for image build' required: false type: string + inject_commit: + default: true + description: "inject commit to docker images true or false" + required: false + type: string + jobs: get-test-matrix: runs-on: ubuntu-latest @@ -56,4 +62,5 @@ jobs: services: ${{ inputs.services }} tag: ${{ inputs.tag }} opea_branch: ${{ inputs.opea_branch }} + inject_commit: ${{ inputs.inject_commit }} secrets: inherit diff --git a/.github/workflows/nightly-docker-build-publish.yml b/.github/workflows/nightly-docker-build-publish.yml new file mode 100644 index 0000000000..d30562224f --- /dev/null +++ b/.github/workflows/nightly-docker-build-publish.yml @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +name: Nightly build/publish latest docker images + +on: + schedule: + - cron: "30 13 * * *" # UTC time + workflow_dispatch: + +env: + EXAMPLES: "AgentQnA,AudioQnA,ChatQnA,CodeGen,CodeTrans,DocIndexRetriever,DocSum,FaqGen,InstructionTuning,MultimodalQnA,ProductivitySuite,RerankFinetuning,SearchQnA,Translation,VideoQnA,VisualQnA" + TAG: "latest" + PUBLISH_TAGS: "latest" + +jobs: + get-build-matrix: + runs-on: ubuntu-latest + outputs: + examples_json: ${{ steps.get-matrix.outputs.examples_json }} + EXAMPLES: ${{ steps.get-matrix.outputs.EXAMPLES }} + TAG: ${{ steps.get-matrix.outputs.TAG }} + PUBLISH_TAGS: ${{ steps.get-matrix.outputs.PUBLISH_TAGS }} + steps: + - name: Create Matrix + id: get-matrix + run: | + examples=($(echo ${EXAMPLES} | tr ',' ' ')) + examples_json=$(printf '%s\n' "${examples[@]}" | sort -u | jq -R '.' 
| jq -sc '.') + echo "examples_json=$examples_json" >> $GITHUB_OUTPUT + echo "EXAMPLES=$EXAMPLES" >> $GITHUB_OUTPUT + echo "TAG=$TAG" >> $GITHUB_OUTPUT + echo "PUBLISH_TAGS=$PUBLISH_TAGS" >> $GITHUB_OUTPUT + + build: + needs: get-build-matrix + strategy: + matrix: + example: ${{ fromJSON(needs.get-build-matrix.outputs.examples_json) }} + fail-fast: false + uses: ./.github/workflows/_example-workflow.yml + with: + node: gaudi + example: ${{ matrix.example }} + secrets: inherit + + get-image-list: + needs: get-build-matrix + uses: ./.github/workflows/_get-image-list.yml + with: + examples: ${{ needs.get-build-matrix.outputs.EXAMPLES }} + + publish: + needs: [get-build-matrix, get-image-list, build] + strategy: + matrix: + image: ${{ fromJSON(needs.get-image-list.outputs.matrix) }} + runs-on: "docker-build-gaudi" + steps: + - uses: docker/login-action@v3.2.0 + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Image Publish + uses: opea-project/validation/actions/image-publish@main + with: + local_image_ref: ${OPEA_IMAGE_REPO}opea/${{ matrix.image }}:${{ needs.get-build-matrix.outputs.TAG }} + image_name: opea/${{ matrix.image }} + publish_tags: ${{ needs.get-build-matrix.outputs.PUBLISH_TAGS }} diff --git a/.github/workflows/pr-gmc-e2e.yaml b/.github/workflows/pr-gmc-e2e.yaml index c2faf3a617..b0be26e993 100644 --- a/.github/workflows/pr-gmc-e2e.yaml +++ b/.github/workflows/pr-gmc-e2e.yaml @@ -12,7 +12,7 @@ on: - "**/tests/test_gmc**" - "!**.md" - "!**.txt" - - "!**/kubernetes/**/manifests/**" + - "!**/kubernetes/**/manifest/**" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/pr-path-detection.yml b/.github/workflows/pr-path-detection.yml index cf640cb147..c314bd614d 100644 --- a/.github/workflows/pr-path-detection.yml +++ b/.github/workflows/pr-path-detection.yml @@ -61,14 +61,14 @@ jobs: changed_files="$(git diff --name-status --diff-filter=ARM ${{ github.event.pull_request.base.sha }} ${merged_commit} | awk '/\.md$/ {print $NF}')" if [ -n "$changed_files" ]; then for changed_file in $changed_files; do - echo $changed_file + # echo $changed_file url_lines=$(grep -H -Eo '\]\(http[s]?://[^)]+\)' "$changed_file" | grep -Ev 'GenAIExamples/blob/main') || true if [ -n "$url_lines" ]; then for url_line in $url_lines; do - echo $url_line + # echo $url_line url=$(echo "$url_line"|cut -d '(' -f2 | cut -d ')' -f1|sed 's/\.git$//') path=$(echo "$url_line"|cut -d':' -f1 | cut -d'/' -f2-) - response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url") + response=$(curl -L -s -o /dev/null -w "%{http_code}" "$url")|| true if [ "$response" -ne 200 ]; then echo "**********Validation failed, try again**********" response_retry=$(curl -s -o /dev/null -w "%{http_code}" "$url") diff --git a/.github/workflows/scripts/get_test_matrix.sh b/.github/workflows/scripts/get_test_matrix.sh index ac373f350a..a024617027 100644 --- a/.github/workflows/scripts/get_test_matrix.sh +++ b/.github/workflows/scripts/get_test_matrix.sh @@ -9,12 +9,15 @@ set -e changed_files=$changed_files test_mode=$test_mode run_matrix="{\"include\":[" -hardware_list="xeon gaudi" # current support hardware list examples=$(printf '%s\n' "${changed_files[@]}" | grep '/' | cut -d'/' -f1 | sort -u) for example in ${examples}; do cd $WORKSPACE/$example if [[ ! $(find . -type f | grep ${test_mode}) ]]; then continue; fi + cd tests + ls -l + hardware_list=$(find . -type f -name "test_compose*_on_*.sh" | cut -d/ -f2 | cut -d. 
-f1 | awk -F'_on_' '{print $2}'| sort -u) + echo "Test supported hardware list = ${hardware_list}" run_hardware="" if [[ $(printf '%s\n' "${changed_files[@]}" | grep ${example} | cut -d'/' -f2 | grep -E '*.py|Dockerfile*|ui|docker_image_build' ) ]]; then diff --git a/.gitignore b/.gitignore index 3a5650d215..8b736f831e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ **/playwright/.cache/ **/test-results/ -__pycache__/ \ No newline at end of file +__pycache__/ diff --git a/.prettierignore b/.prettierignore index 0978a006b9..4ab09a93b7 100644 --- a/.prettierignore +++ b/.prettierignore @@ -1 +1 @@ -**/kubernetes/ \ No newline at end of file +**/kubernetes/ diff --git a/AgentQnA/README.md b/AgentQnA/README.md index e6cfaf7c9f..9c351a856f 100644 --- a/AgentQnA/README.md +++ b/AgentQnA/README.md @@ -81,17 +81,13 @@ flowchart LR 3. Hierarchical agent can further improve performance. Expert worker agents, such as retrieval agent, knowledge graph agent, SQL agent, etc., can provide high-quality output for different aspects of a complex query, and the supervisor agent can aggregate the information together to provide a comprehensive answer. -### Roadmap +## Deployment with docker -- v0.9: Worker agent uses open-source websearch tool (duckduckgo), agents use OpenAI GPT-4o-mini as llm backend. -- v1.0: Worker agent uses OPEA retrieval megaservice as tool. -- v1.0 or later: agents use open-source llm backend. -- v1.1 or later: add safeguards +1. Build agent docker image -## Getting started + Note: this is optional. The docker images will be automatically pulled when running the docker compose commands. This step is only needed if pulling images failed. -1. Build agent docker image
- First, clone the opea GenAIComps repo + First, clone the OPEA GenAIComps repo. ``` export WORKDIR= @@ -106,35 +102,63 @@ flowchart LR docker build -t opea/agent-langchain:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/agent/langchain/Dockerfile . ``` -2. Launch tool services
- In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs. - - ``` - docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0 - ``` - -3. Set up environment for this example
- First, clone this repo +2. Set up environment for this example
+ First, clone this repo. ``` cd $WORKDIR git clone https://github.com/opea-project/GenAIExamples.git ``` - Second, set up env vars + Second, set up env vars. ``` export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/ - # optional: OPANAI_API_KEY + # for using open-source LLMs + export HUGGINGFACEHUB_API_TOKEN= + export HF_CACHE_DIR= # so models need not be re-downloaded every time + + # optional: OPENAI_API_KEY if you want to use OpenAI models export OPENAI_API_KEY= ``` -4. Launch agent services
- The configurations of the supervisor agent and the worker agent are defined in the docker-compose yaml file. We currently use openAI GPT-4o-mini as LLM, and we plan to add support for llama3.1-70B-instruct (served by TGI-Gaudi) in a subsequent release. - To use openai llm, run command below. +3. Deploy the retrieval tool (i.e., DocIndexRetriever mega-service) + + First, launch the mega-service. + + ``` + cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool + bash launch_retrieval_tool.sh + ``` + + Then, ingest data into the vector database. Here we provide an example. You can ingest your own data. + + ``` + bash run_ingest_data.sh + ``` + +4. Launch other tools.
+ In this example, we will use some of the mock APIs provided in the Meta CRAG KDD Challenge to demonstrate the benefits of gaining additional context from mock knowledge graphs. + + ``` + docker run -d -p=8080:8000 docker.io/aicrowd/kdd-cup-24-crag-mock-api:v0 + ``` + +5. Launch agent services
+ We provide two options for the `llm_engine` of the agents: (1) open-source LLMs and (2) OpenAI models via API calls. + + To use open-source LLMs on Gaudi2, run the commands below. + + ``` + cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi + bash launch_tgi_gaudi.sh + bash launch_agent_service_tgi_gaudi.sh + ``` + + To use OpenAI models, run the commands below. ``` - cd docker_compose/intel/cpu/xeon + cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/cpu/xeon bash launch_agent_service_openai.sh ``` @@ -143,10 +167,12 @@ flowchart LR First look at logs of the agent docker containers: ``` -docker logs docgrader-agent-endpoint +# worker agent docker logs rag-agent-endpoint ``` ``` + +# supervisor agent docker logs react-agent-endpoint ``` @@ -170,4 +196,4 @@ curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: app ## How to register your own tools with agent -You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/langchain/README.md#5-customize-agent-strategy). +You can take a look at the tools yaml and python files in this example. For more details, please refer to the "Provide your own tools" section in the instructions [here](https://github.com/opea-project/GenAIComps/tree/main/comps/agent/langchain/README.md). diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/README.md b/AgentQnA/docker_compose/intel/cpu/xeon/README.md new file mode 100644 index 0000000000..852a0476c6 --- /dev/null +++ b/AgentQnA/docker_compose/intel/cpu/xeon/README.md @@ -0,0 +1,3 @@ +# Deployment on Xeon + +We deploy the retrieval tool on Xeon. For LLMs, we support OpenAI models via API calls. For instructions on using open-source LLMs, please refer to the deployment guide [here](../../../../README.md).
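As a quick sanity check after either launch path, both agent endpoints can be queried directly. The sketch below mirrors the payload and response handling of `AgentQnA/tests/test.py` added later in this diff; the `ip_address` environment variable and ports 9095/9090 are taken from that script and the compose files:

```python
# Smoke-test the worker agent (port 9095) and supervisor agent (port 9090).
# Mirrors the request/response shape used by AgentQnA/tests/test.py.
import os

import requests

ip_address = os.getenv("ip_address", "localhost")
for port in ("9095", "9090"):  # worker agent, then supervisor agent
    url = f"http://{ip_address}:{port}/v1/chat/completions"
    response = requests.post(
        url,
        json={"query": "Tell me about Michael Jackson song thriller"},
        proxies={"http": ""},  # bypass any local http proxy
    )
    response.raise_for_status()
    print(f"[{port}] {response.json()['text']}")
```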
diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml index bac5bbc627..837f2a0871 100644 --- a/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml +++ b/AgentQnA/docker_compose/intel/cpu/xeon/compose_openai.yaml @@ -2,11 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 services: - worker-docgrader-agent: + worker-rag-agent: image: opea/agent-langchain:latest - container_name: docgrader-agent-endpoint + container_name: rag-agent-endpoint volumes: - - ${WORKDIR}/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ ports: - "9095:9095" @@ -36,8 +35,9 @@ services: supervisor-react-agent: image: opea/agent-langchain:latest container_name: react-agent-endpoint + depends_on: + - worker-rag-agent volumes: - - ${WORKDIR}/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ ports: - "9090:9090" diff --git a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh index 6c2094cc8e..f35e60fd13 100644 --- a/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh +++ b/AgentQnA/docker_compose/intel/cpu/xeon/launch_agent_service_openai.sh @@ -7,7 +7,7 @@ export recursion_limit_worker=12 export recursion_limit_supervisor=10 export model="gpt-4o-mini-2024-07-18" export temperature=0 -export max_new_tokens=512 +export max_new_tokens=4096 export OPENAI_API_KEY=${OPENAI_API_KEY} export WORKER_AGENT_URL="http://${ip_address}:9095/v1/chat/completions" export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool" diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 5200f757e3..6a9d0b4650 100644 --- a/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -2,37 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 services: - tgi-server: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 - container_name: tgi-server - ports: - - "8085:80" - volumes: - - ${HF_CACHE_DIR}:/data - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - PT_HPU_ENABLE_LAZY_COLLECTIVES: true - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - runtime: habana - cap_add: - - SYS_NICE - ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard ${NUM_SHARDS} - worker-docgrader-agent: + worker-rag-agent: image: opea/agent-langchain:latest - container_name: docgrader-agent-endpoint - depends_on: - - tgi-server + container_name: rag-agent-endpoint volumes: # - ${WORKDIR}/GenAIExamples/AgentQnA/docker_image_build/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ @@ -41,7 +13,7 @@ services: ipc: host environment: ip_address: ${ip_address} - strategy: rag_agent + strategy: rag_agent_llama recursion_limit: ${recursion_limit_worker} llm_engine: tgi HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} @@ -66,8 +38,7 @@ services: image: opea/agent-langchain:latest container_name: react-agent-endpoint depends_on: - - tgi-server - - 
worker-docgrader-agent + - worker-rag-agent volumes: # - ${WORKDIR}/GenAIExamples/AgentQnA/docker_image_build/GenAIComps/comps/agent/langchain/:/home/user/comps/agent/langchain/ - ${TOOLSET_PATH}:/home/user/tools/ @@ -76,7 +47,7 @@ services: ipc: host environment: ip_address: ${ip_address} - strategy: react_langgraph + strategy: react_llama recursion_limit: ${recursion_limit_supervisor} llm_engine: tgi HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh index f4154fb229..966a037974 100644 --- a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_agent_service_tgi_gaudi.sh @@ -15,7 +15,7 @@ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct" export NUM_SHARDS=4 export LLM_ENDPOINT_URL="http://${ip_address}:8085" export temperature=0.01 -export max_new_tokens=512 +export max_new_tokens=4096 # agent related environment variables export TOOLSET_PATH=$WORKDIR/GenAIExamples/AgentQnA/tools/ @@ -27,17 +27,3 @@ export RETRIEVAL_TOOL_URL="http://${ip_address}:8889/v1/retrievaltool" export CRAG_SERVER=http://${ip_address}:8080 docker compose -f compose.yaml up -d - -sleep 5s -echo "Waiting tgi gaudi ready" -n=0 -until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do - docker logs tgi-server &> tgi-gaudi-service.log - n=$((n+1)) - if grep -q Connected tgi-gaudi-service.log; then - break - fi - sleep 5s -done -sleep 5s -echo "Service started successfully" diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/launch_tgi_gaudi.sh b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_tgi_gaudi.sh new file mode 100644 index 0000000000..75b2a9c7f4 --- /dev/null +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/launch_tgi_gaudi.sh @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# LLM related environment variables +export HF_CACHE_DIR=${HF_CACHE_DIR} +ls $HF_CACHE_DIR +export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-70B-Instruct" +export NUM_SHARDS=4 + +docker compose -f tgi_gaudi.yaml up -d + +sleep 5s +echo "Waiting tgi gaudi ready" +n=0 +until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do + docker logs tgi-server &> tgi-gaudi-service.log + n=$((n+1)) + if grep -q Connected tgi-gaudi-service.log; then + break + fi + sleep 5s +done +sleep 5s +echo "Service started successfully" diff --git a/AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml b/AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml new file mode 100644 index 0000000000..59c5671e15 --- /dev/null +++ b/AgentQnA/docker_compose/intel/hpu/gaudi/tgi_gaudi.yaml @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + tgi-server: + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 + container_name: tgi-server + ports: + - "8085:80" + volumes: + - ${HF_CACHE_DIR}:/data + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + PT_HPU_ENABLE_LAZY_COLLECTIVES: true + ENABLE_HPU_GRAPH: true + LIMIT_HPU_GRAPH: true + USE_FLASH_ATTENTION: true + FLASH_ATTENTION_RECOMPUTE: true + runtime: habana + cap_add: + - SYS_NICE + ipc: host + 
command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard ${NUM_SHARDS} diff --git a/AgentQnA/tests/1_build_images.sh b/AgentQnA/tests/step1_build_images.sh similarity index 100% rename from AgentQnA/tests/1_build_images.sh rename to AgentQnA/tests/step1_build_images.sh diff --git a/AgentQnA/tests/2_start_retrieval_tool.sh b/AgentQnA/tests/step2_start_retrieval_tool.sh similarity index 100% rename from AgentQnA/tests/2_start_retrieval_tool.sh rename to AgentQnA/tests/step2_start_retrieval_tool.sh diff --git a/AgentQnA/tests/3_ingest_data_and_validate_retrieval.sh b/AgentQnA/tests/step3_ingest_data_and_validate_retrieval.sh similarity index 100% rename from AgentQnA/tests/3_ingest_data_and_validate_retrieval.sh rename to AgentQnA/tests/step3_ingest_data_and_validate_retrieval.sh diff --git a/AgentQnA/tests/4_launch_and_validate_agent_openai.sh b/AgentQnA/tests/step4_launch_and_validate_agent_openai.sh similarity index 100% rename from AgentQnA/tests/4_launch_and_validate_agent_openai.sh rename to AgentQnA/tests/step4_launch_and_validate_agent_openai.sh diff --git a/AgentQnA/tests/4_launch_and_validate_agent_tgi.sh b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh similarity index 64% rename from AgentQnA/tests/4_launch_and_validate_agent_tgi.sh rename to AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh index f7b36da2a3..fde46e0d5a 100644 --- a/AgentQnA/tests/4_launch_and_validate_agent_tgi.sh +++ b/AgentQnA/tests/step4_launch_and_validate_agent_tgi.sh @@ -17,6 +17,12 @@ if [ ! -d "$HF_CACHE_DIR" ]; then fi ls $HF_CACHE_DIR +function start_tgi(){ + echo "Starting tgi-gaudi server" + cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi + bash launch_tgi_gaudi.sh + +} function start_agent_and_api_server() { echo "Starting CRAG server" @@ -25,6 +31,7 @@ function start_agent_and_api_server() { echo "Starting Agent services" cd $WORKDIR/GenAIExamples/AgentQnA/docker_compose/intel/hpu/gaudi bash launch_agent_service_tgi_gaudi.sh + sleep 10 } function validate() { @@ -43,18 +50,22 @@ function validate() { function validate_agent_service() { echo "----------------Test agent ----------------" - local CONTENT=$(http_proxy="" curl http://${ip_address}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ - "query": "Tell me about Michael Jackson song thriller" - }') - local EXIT_CODE=$(validate "$CONTENT" "Thriller" "react-agent-endpoint") - docker logs docgrader-agent-endpoint + # local CONTENT=$(http_proxy="" curl http://${ip_address}:9095/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + # "query": "Tell me about Michael Jackson song thriller" + # }') + export agent_port="9095" + local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py) + local EXIT_CODE=$(validate "$CONTENT" "Thriller" "rag-agent-endpoint") + docker logs rag-agent-endpoint if [ "$EXIT_CODE" == "1" ]; then exit 1 fi - local CONTENT=$(http_proxy="" curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ - "query": "Tell me about Michael Jackson song thriller" - }') + # local CONTENT=$(http_proxy="" curl http://${ip_address}:9090/v1/chat/completions -X POST -H "Content-Type: application/json" -d '{ + # "query": "Tell me about Michael Jackson song thriller" + # }') + export agent_port="9090" + local CONTENT=$(python3 $WORKDIR/GenAIExamples/AgentQnA/tests/test.py) local EXIT_CODE=$(validate "$CONTENT" "Thriller" "react-agent-endpoint") docker logs 
react-agent-endpoint if [ "$EXIT_CODE" == "1" ]; then @@ -64,6 +75,10 @@ function validate_agent_service() { } function main() { + echo "==================== Start TGI ====================" + start_tgi + echo "==================== TGI started ====================" + echo "==================== Start agent ====================" start_agent_and_api_server echo "==================== Agent started ====================" diff --git a/AgentQnA/tests/test.py b/AgentQnA/tests/test.py new file mode 100644 index 0000000000..f0ef934412 --- /dev/null +++ b/AgentQnA/tests/test.py @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +import requests + + +def generate_answer_agent_api(url, prompt): + proxies = {"http": ""} + payload = { + "query": prompt, + } + response = requests.post(url, json=payload, proxies=proxies) + answer = response.json()["text"] + return answer + + +if __name__ == "__main__": + ip_address = os.getenv("ip_address", "localhost") + agent_port = os.getenv("agent_port", "9095") + url = f"http://{ip_address}:{agent_port}/v1/chat/completions" + prompt = "Tell me about Michael Jackson song thriller" + answer = generate_answer_agent_api(url, prompt) + print(answer) diff --git a/AgentQnA/tests/test_compose_on_gaudi.sh b/AgentQnA/tests/test_compose_on_gaudi.sh index efe1aeeecd..5f7e899dcf 100644 --- a/AgentQnA/tests/test_compose_on_gaudi.sh +++ b/AgentQnA/tests/test_compose_on_gaudi.sh @@ -19,7 +19,6 @@ function stop_crag() { function stop_agent_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi/ - # docker compose -f compose.yaml down container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2) for container_name in $container_list; do cid=$(docker ps -aq --filter "name=$container_name") @@ -28,11 +27,21 @@ function stop_agent_docker() { done } +function stop_tgi(){ + cd $WORKPATH/docker_compose/intel/hpu/gaudi/ + container_list=$(cat tgi_gaudi.yaml | grep container_name | cut -d':' -f2) + for container_name in $container_list; do + cid=$(docker ps -aq --filter "name=$container_name") + echo "Stopping container $container_name" + if [[ ! 
-z "$cid" ]]; then docker rm $cid -f && sleep 1s; fi + done + +} + function stop_retrieval_tool() { echo "Stopping Retrieval tool" local RETRIEVAL_TOOL_PATH=$WORKPATH/../DocIndexRetriever cd $RETRIEVAL_TOOL_PATH/docker_compose/intel/cpu/xeon/ - # docker compose -f compose.yaml down container_list=$(cat compose.yaml | grep container_name | cut -d':' -f2) for container_name in $container_list; do cid=$(docker ps -aq --filter "name=$container_name") @@ -43,25 +52,26 @@ function stop_retrieval_tool() { echo "workpath: $WORKPATH" echo "=================== Stop containers ====================" stop_crag +stop_tgi stop_agent_docker stop_retrieval_tool cd $WORKPATH/tests echo "=================== #1 Building docker images====================" -bash 1_build_images.sh +bash step1_build_images.sh echo "=================== #1 Building docker images completed====================" echo "=================== #2 Start retrieval tool====================" -bash 2_start_retrieval_tool.sh +bash step2_start_retrieval_tool.sh echo "=================== #2 Retrieval tool started====================" echo "=================== #3 Ingest data and validate retrieval====================" -bash 3_ingest_data_and_validate_retrieval.sh +bash step3_ingest_data_and_validate_retrieval.sh echo "=================== #3 Data ingestion and validation completed====================" echo "=================== #4 Start agent and API server====================" -bash 4_launch_and_validate_agent_tgi.sh +bash step4_launch_and_validate_agent_tgi.sh echo "=================== #4 Agent test passed ====================" echo "=================== #5 Stop agent and API server====================" @@ -70,4 +80,6 @@ stop_agent_docker stop_retrieval_tool echo "=================== #5 Agent and API server stopped====================" +echo y | docker system prune + echo "ALL DONE!" diff --git a/AgentQnA/tools/supervisor_agent_tools.yaml b/AgentQnA/tools/supervisor_agent_tools.yaml index 58110e5292..4b53cc9f9f 100644 --- a/AgentQnA/tools/supervisor_agent_tools.yaml +++ b/AgentQnA/tools/supervisor_agent_tools.yaml @@ -25,7 +25,7 @@ get_billboard_rank_date: args_schema: rank: type: int - description: song name + description: the rank of interest, for example 1 for top 1 date: type: str description: date diff --git a/AgentQnA/tools/worker_agent_tools.py b/AgentQnA/tools/worker_agent_tools.py index 1dfdb8409e..fded38ec3a 100644 --- a/AgentQnA/tools/worker_agent_tools.py +++ b/AgentQnA/tools/worker_agent_tools.py @@ -12,16 +12,31 @@ def search_knowledge_base(query: str) -> str: print(url) proxies = {"http": ""} payload = { - "text": query, + "messages": query, } response = requests.post(url, json=payload, proxies=proxies) print(response) - docs = response.json()["documents"] - context = "" - for i, doc in enumerate(docs): - if i == 0: - context = doc - else: - context += "\n" + doc - print(context) - return context + if "documents" in response.json(): + docs = response.json()["documents"] + context = "" + for i, doc in enumerate(docs): + if i == 0: + context = doc + else: + context += "\n" + doc + # print(context) + return context + elif "text" in response.json(): + return response.json()["text"] + elif "reranked_docs" in response.json(): + docs = response.json()["reranked_docs"] + context = "" + for i, doc in enumerate(docs): + if i == 0: + context = doc["text"] + else: + context += "\n" + doc["text"] + # print(context) + return context + else: + return "Error parsing response from the knowledge base." 
diff --git a/AudioQnA/Dockerfile b/AudioQnA/Dockerfile index e2273d381b..265c9c9b5d 100644 --- a/AudioQnA/Dockerfile +++ b/AudioQnA/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt COPY ./audioqna.py /home/user/audioqna.py diff --git a/AudioQnA/Dockerfile.multilang b/AudioQnA/Dockerfile.multilang index c62cb04048..ef7c926975 100644 --- a/AudioQnA/Dockerfile.multilang +++ b/AudioQnA/Dockerfile.multilang @@ -18,7 +18,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt COPY ./audioqna_multilang.py /home/user/audioqna_multilang.py diff --git a/AudioQnA/benchmark/performance/README.md b/AudioQnA/benchmark/performance/README.md new file mode 100644 index 0000000000..3d1bbc1c92 --- /dev/null +++ b/AudioQnA/benchmark/performance/README.md @@ -0,0 +1,77 @@ +# AudioQnA Benchmarking + +This folder contains a collection of scripts to enable inference benchmarking by leveraging [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), a comprehensive benchmarking tool that provides throughput analysis for assessing inference performance. + +By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community. + +## Purpose + +We aim to run these benchmarks and share them with the OPEA community for three primary reasons: + +- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs. +- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case. +- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading LLMs, serving frameworks, etc. + +## Metrics + +The benchmark will report the metrics below: + +- Number of Concurrent Requests +- End-to-End Latency: P50, P90, P99 (in milliseconds) +- End-to-End First Token Latency: P50, P90, P99 (in milliseconds) +- Average Next Token Latency (in milliseconds) +- Average Token Latency (in milliseconds) +- Requests Per Second (RPS) +- Output Tokens Per Second +- Input Tokens Per Second + +Results will be displayed in the terminal and saved as a CSV file named `1_stats.csv` for easy export to spreadsheets. + +## Getting Started + +We recommend using Kubernetes to deploy the AudioQnA service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs. + +### Prerequisites + +- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md). + +- Ensure every node has direct internet access. +- Set up kubectl on the master node with access to the Kubernetes cluster. +- Install Python 3.8+ on the master node for running GenAIEval. +- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods.
+- Ensure that the container's ulimit can meet the number of requests. + +```bash +# The way to modify the containerd ulimit: +sudo systemctl edit containerd +# Add two lines: +[Service] +LimitNOFILE=65536:1048576 + +sudo systemctl daemon-reload; sudo systemctl restart containerd +``` + +### Test Steps + +Please deploy the AudioQnA service before benchmarking. + +##### Run Benchmark Test + +Before the benchmark, we can configure the number of test queries and the test output directory by: + +```bash +export USER_QUERIES="[128, 128, 128, 128]" +export TEST_OUTPUT_DIR="/tmp/benchmark_output" +``` + +And then run the benchmark by: + +```bash +bash benchmark.sh -n +``` + +The argument `-n` refers to the number of test nodes. + +##### Data Collection + +All test results will be saved in the folder `/tmp/benchmark_output`, configured by the environment variable `TEST_OUTPUT_DIR` in the previous steps. diff --git a/AudioQnA/benchmark/performance/benchmark.sh b/AudioQnA/benchmark/performance/benchmark.sh new file mode 100644 index 0000000000..2930c7753f --- /dev/null +++ b/AudioQnA/benchmark/performance/benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +deployment_type="k8s" +node_number=1 +service_port=8888 +query_per_node=128 + +benchmark_tool_path="$(pwd)/GenAIEval" + +usage() { + echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]" + echo " -d deployment_type AudioQnA deployment type, select between k8s and docker (default: k8s)" + echo " -n node_number Test node number, required only for k8s deployment_type, (default: 1)" + echo " -i service_ip AudioQnA service ip, required only for docker deployment_type" + echo " -p service_port AudioQnA service port, required only for docker deployment_type, (default: 8888)" + exit 1 +} + +while getopts ":d:n:i:p:" opt; do + case ${opt} in + d ) + deployment_type=$OPTARG + ;; + n ) + node_number=$OPTARG + ;; + i ) + service_ip=$OPTARG + ;; + p ) + service_port=$OPTARG + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then + echo "Error: service_ip is required for docker deployment_type" 1>&2 + usage +fi + +if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then + echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2 +fi + +function main() { + if [[ ! -d ${benchmark_tool_path} ]]; then + echo "Benchmark tool not found, setting up..." + setup_env + fi + run_benchmark +} + +function setup_env() { + git clone https://github.com/opea-project/GenAIEval.git + pushd ${benchmark_tool_path} + python3 -m venv stress_venv + source stress_venv/bin/activate + pip install -r requirements.txt + popd +} + +function run_benchmark() { + source ${benchmark_tool_path}/stress_venv/bin/activate + export DEPLOYMENT_TYPE=${deployment_type} + export SERVICE_IP=${service_ip:-"None"} + export SERVICE_PORT=${service_port:-"None"} + if [[ -z $USER_QUERIES ]]; then + user_query=$((query_per_node*node_number)) + export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]" + echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}."
+ fi + export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//') + if [[ -z $WARMUP ]]; then export WARMUP=0; fi + if [[ -z $TEST_OUTPUT_DIR ]]; then + if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}" + else + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker" + fi + echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}." + fi + + envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml + cd ${benchmark_tool_path}/evals/benchmark + python benchmark.py +} + +main diff --git a/AudioQnA/benchmark/performance/benchmark.yaml b/AudioQnA/benchmark/performance/benchmark.yaml new file mode 100644 index 0000000000..659a99a759 --- /dev/null +++ b/AudioQnA/benchmark/performance/benchmark.yaml @@ -0,0 +1,52 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +test_suite_config: # Overall configuration settings for the test suite + examples: ["audioqna"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna + deployment_type: "k8s" # Default is "k8s", can also be "docker" + service_ip: None # Leave as None for k8s, specify for Docker + service_port: None # Leave as None for k8s, specify for Docker + warm_ups: 0 # Number of test requests for warm-up + run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs + user_queries: [1, 2, 4, 8, 16, 32, 64, 128] # Number of test requests at each concurrency level + query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default. + random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "Intel/neural-chat-7b-v3-3" # The LLM model used for the test + test_output_dir: "/tmp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant (locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users + poisson: # Poisson load shape specific parameters, activate only if load_shape is poisson + arrival-rate: 1.0 # Request arrival rate + namespace: "" # Fill the user-defined namespace. Otherwise, it will be default.
+ +test_cases: + audioqna: + asr: + run_test: true + service_name: "asr-svc" # Replace with your service name + llm: + run_test: true + service_name: "llm-svc" # Replace with your service name + parameters: + model_name: "Intel/neural-chat-7b-v3-3" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: true + service_name: "llm-svc" # Replace with your service name + tts: + run_test: true + service_name: "tts-svc" # Replace with your service name + e2e: + run_test: true + service_name: "audioqna-backend-server-svc" # Replace with your service name diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml index a0ef81d172..ea3c45b919 100644 --- a/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -41,7 +41,7 @@ services: environment: TTS_ENDPOINT: ${TTS_ENDPOINT} tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml index d8ca1d7f8c..3e20dbc4af 100644 --- a/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml +++ b/AudioQnA/docker_compose/intel/cpu/xeon/compose_multilang.yaml @@ -26,7 +26,7 @@ services: https_proxy: ${https_proxy} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml index c3f885fcee..b536522c4f 100644 --- a/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/AudioQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -51,7 +51,7 @@ services: environment: TTS_ENDPOINT: ${TTS_ENDPOINT} tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: tgi-gaudi-server ports: - "3006:80" diff --git a/AudioQnA/kubernetes/intel/README.md b/AudioQnA/kubernetes/intel/README.md index 27948ed8b7..07bc6c1a13 100644 --- a/AudioQnA/kubernetes/intel/README.md +++ b/AudioQnA/kubernetes/intel/README.md @@ -7,14 +7,14 @@ ## Deploy On Xeon ``` -cd GenAIExamples/AudioQnA/kubernetes/intel/cpu/xeon/manifests +cd GenAIExamples/AudioQnA/kubernetes/intel/cpu/xeon/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" audioqna.yaml kubectl apply -f audioqna.yaml ``` ## Deploy On Gaudi ``` -cd GenAIExamples/AudioQnA/kubernetes/intel/hpu/gaudi/manifests +cd GenAIExamples/AudioQnA/kubernetes/intel/hpu/gaudi/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" audioqna.yaml kubectl apply -f audioqna.yaml diff --git a/AudioQnA/kubernetes/intel/README_gmc.md b/AudioQnA/kubernetes/intel/README_gmc.md index 30d879e196..767fdf3667 100644 --- a/AudioQnA/kubernetes/intel/README_gmc.md +++ b/AudioQnA/kubernetes/intel/README_gmc.md @@ -25,7 +25,7 @@ The AudioQnA uses the below prebuilt images if you choose a Xeon deployment Should you desire to use the Gaudi accelerator, two alternate images are used for the 
embedding and llm services. For Gaudi: -- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.5 +- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6 - whisper-gaudi: opea/whisper-gaudi:latest - speecht5-gaudi: opea/speecht5-gaudi:latest diff --git a/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml b/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml index bd76774835..6856d2b878 100644 --- a/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml +++ b/AudioQnA/kubernetes/intel/cpu/xeon/manifest/audioqna.yaml @@ -247,7 +247,7 @@ spec: - envFrom: - configMapRef: name: audio-qna-config - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" name: llm-dependency-deploy-demo securityContext: capabilities: diff --git a/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml b/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml index 2d0c567e3a..6659a7811a 100644 --- a/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml +++ b/AudioQnA/kubernetes/intel/hpu/gaudi/manifest/audioqna.yaml @@ -271,7 +271,7 @@ spec: - envFrom: - configMapRef: name: audio-qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 name: llm-dependency-deploy-demo securityContext: capabilities: diff --git a/AudioQnA/tests/test_compose_on_gaudi.sh b/AudioQnA/tests/test_compose_on_gaudi.sh index 69270736d6..e626b2671a 100644 --- a/AudioQnA/tests/test_compose_on_gaudi.sh +++ b/AudioQnA/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="audioqna whisper-gaudi asr llm-tgi speecht5-gaudi tts" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker images && sleep 1s } diff --git a/AudioQnA/tests/test_compose_on_xeon.sh b/AudioQnA/tests/test_compose_on_xeon.sh index b36b5c7de9..926a51a33f 100644 --- a/AudioQnA/tests/test_compose_on_xeon.sh +++ b/AudioQnA/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="audioqna whisper asr llm-tgi speecht5 tts" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker images && sleep 1s } diff --git a/AudioQnA/ui/docker/Dockerfile b/AudioQnA/ui/docker/Dockerfile index ac2bb7da31..1d5115f4b5 100644 --- a/AudioQnA/ui/docker/Dockerfile +++ b/AudioQnA/ui/docker/Dockerfile @@ -23,4 +23,4 @@ RUN npm run build EXPOSE 5173 # Run the front-end application in preview mode -CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] \ No newline at end of file +CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] diff --git a/AudioQnA/ui/svelte/src/app.postcss b/AudioQnA/ui/svelte/src/app.postcss index c3e0519c6a..4b957234dc 100644 --- a/AudioQnA/ui/svelte/src/app.postcss +++ b/AudioQnA/ui/svelte/src/app.postcss @@ -79,4 +79,4 @@ a.btn { .w-12\/12 { width: 100% -} \ No newline at end of file +} diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/1.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/1.svg index 38adea6ffc..71ac8d5fcc 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/1.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/1.svg @@ -89,4 +89,4 @@ - \ No newline at end of file + diff --git 
a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/2.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/2.svg index 0e6150e4ae..95d4056589 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/2.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/2.svg @@ -89,4 +89,4 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/3.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/3.svg index 3ed7f7fc5b..310d437e5d 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/3.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/3.svg @@ -76,4 +76,4 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/4.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/4.svg index 2b34e86b01..f3281671de 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/4.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/4.svg @@ -76,4 +76,4 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/5.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/5.svg index 718f3b304b..8a5864192d 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/5.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/5.svg @@ -89,4 +89,4 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/stop-recording.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/stop-recording.svg index 4f4e638bab..82e497ab04 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/stop-recording.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/stop-recording.svg @@ -3,4 +3,4 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/upload.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/upload.svg index 55790f05f7..5264818ebf 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/upload.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/upload.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voice.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voice.svg index 0cc1d520e4..2d1375e1a5 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voice.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voice.svg @@ -6,4 +6,4 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOff.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOff.svg index 8161062a4c..fe9b59ee83 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOff.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOff.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOn.svg b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOn.svg index aeb96fabea..eca1441d15 100644 --- a/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOn.svg +++ b/AudioQnA/ui/svelte/src/lib/assets/icons/svg/voiceOn.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/AvatarChatbot/.gitignore b/AvatarChatbot/.gitignore index 8ad440e683..84dc2308b2 100644 --- a/AvatarChatbot/.gitignore +++ b/AvatarChatbot/.gitignore @@ -4,3 +4,5 @@ *.log docker_compose/intel/cpu/xeon/data docker_compose/intel/hpu/gaudi/data +inputs/ +outputs/ diff --git a/AvatarChatbot/README.md b/AvatarChatbot/README.md index ed0e29e8c1..32b387428a 100644 --- a/AvatarChatbot/README.md +++ b/AvatarChatbot/README.md @@ -75,7 +75,7 @@ The AvatarChatbot service can be deployed on either Intel Gaudi2 AI Accelerator ### Deploy AvatarChatbot on Gaudi -Refer to the [Gaudi 
Guide](./docker_compose/intel/hpu/gaudi/README.md) for instructions on deploying AvatarChatbot on Gaudi. +Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) for instructions on deploying AvatarChatbot on Gaudi, and on setting up an UI for the application. ### Deploy AvatarChatbot on Xeon diff --git a/AvatarChatbot/assets/img/UI.png b/AvatarChatbot/assets/img/UI.png new file mode 100644 index 0000000000..c78fe3bea8 Binary files /dev/null and b/AvatarChatbot/assets/img/UI.png differ diff --git a/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md b/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md index 02e64adc92..f0b36c94bb 100644 --- a/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md +++ b/AvatarChatbot/docker_compose/intel/cpu/xeon/README.md @@ -96,9 +96,9 @@ export ANIMATION_SERVICE_PORT=3008 ```bash export DEVICE="cpu" export WAV2LIP_PORT=7860 -export INFERENCE_MODE='wav2lip+gfpgan' +export INFERENCE_MODE='wav2lip_only' export CHECKPOINT_PATH='/usr/local/lib/python3.11/site-packages/Wav2Lip/checkpoints/wav2lip_gan.pth' -export FACE="assets/img/avatar5.png" +export FACE="assets/img/avatar1.jpg" # export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None' export AUDIO='None' export FACESIZE=96 @@ -188,13 +188,16 @@ The output file will be saved in the current working directory, as `${PWD}` is m ## Gradio UI -Follow the instructions in [Build Mega Service of AudioQnA on Gaudi](https://github.com/opea-project/GenAIExamples/blob/main/AudioQnA/docker_compose/intel/hpu/gaudi/README.md) to build necessary Docker images and start the AudioQnA MegaService with the endpoint `http://localhost:3008/v1/audioqna`. Then run the following command to start the Gradio UI: - ```bash -cd GenAIExamples/AvatarChatbot/docker/ui/gradio -python3 app_gradio_demo.py +cd $WORKPATH/GenAIExamples/AvatarChatbot +python3 ui/gradio/app_gradio_demo_avatarchatbot.py ``` +The UI can be viewed at http://${host_ip}:7861 +UI Example +In the current version v1.0, you need to set the avatar figure image/video and the DL model choice in the environment variables before starting AvatarChatbot backend service and running the UI. Please just customize the audio question in the UI. 
+\*\* We will enable change of avatar figure between runs in v2.0 + ## Troubleshooting ```bash diff --git a/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml b/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml index aa6f49bf87..2496b11e87 100644 --- a/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml +++ b/AvatarChatbot/docker_compose/intel/cpu/xeon/compose.yaml @@ -42,7 +42,7 @@ services: environment: TTS_ENDPOINT: ${TTS_ENDPOINT} tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "3006:80" diff --git a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md index f84d763efb..b35726f63d 100644 --- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md +++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/README.md @@ -96,9 +96,9 @@ export ANIMATION_SERVICE_PORT=3008 ```bash export DEVICE="hpu" export WAV2LIP_PORT=7860 -export INFERENCE_MODE='wav2lip+gfpgan' +export INFERENCE_MODE='wav2lip_only' export CHECKPOINT_PATH='/usr/local/lib/python3.10/dist-packages/Wav2Lip/checkpoints/wav2lip_gan.pth' -export FACE="assets/img/avatar5.png" +export FACE="assets/img/avatar1.jpg" # export AUDIO='assets/audio/eg3_ref.wav' # audio file path is optional, will use base64str in the post request as input if is 'None' export AUDIO='None' export FACESIZE=96 @@ -188,14 +188,25 @@ The output file will be saved in the current working directory, as `${PWD}` is m ## Gradio UI -Follow the instructions in [Build Mega Service of AudioQnA on Gaudi](https://github.com/opea-project/GenAIExamples/blob/main/AudioQnA/docker_compose/intel/hpu/gaudi/README.md) to build necessary Docker images and start the AudioQnA MegaService with the endpoint `http://localhost:3008/v1/audioqna`. Then run the following command to start the Gradio UI: +```bash +sudo apt update +sudo apt install -y yasm pkg-config libx264-dev nasm +cd $WORKPATH +git clone https://github.com/FFmpeg/FFmpeg.git +cd FFmpeg +sudo ./configure --enable-gpl --enable-libx264 && sudo make -j$(nproc) && sudo make install && hash -r +pip install gradio==4.38.1 soundfile +``` ```bash -cd GenAIExamples/AvatarChatbot/docker/ui/gradio -python3 app_gradio_demo.py +cd $WORKPATH/GenAIExamples/AvatarChatbot +python3 ui/gradio/app_gradio_demo_avatarchatbot.py ``` -The UI can be viewed at http://${host_ip}:7861 +The UI can be viewed at http://${host_ip}:7861 UI Example +In the current version v1.0, you need to set the avatar figure image/video and the DL model choice in the environment variables before starting the AvatarChatbot backend service and running the UI. Please just customize the audio question in the UI.
+** Changing the avatar figure between runs will be enabled in v2.0.

 ## Troubleshooting

diff --git a/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml b/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml
index 06a7e4e054..2003bb4a99 100644
--- a/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/AvatarChatbot/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -15,7 +15,7 @@ services:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      HABANA_VISIBLE_MODULES: all
+      HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
     runtime: habana
     cap_add:
@@ -39,7 +39,7 @@ services:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      HABANA_VISIBLE_MODULES: all
+      HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
     runtime: habana
     cap_add:
@@ -54,7 +54,7 @@ services:
     environment:
       TTS_ENDPOINT: ${TTS_ENDPOINT}
   tgi-service:
-    image: ghcr.io/huggingface/tgi-gaudi:2.0.5
+    image: ghcr.io/huggingface/tgi-gaudi:2.0.6
     container_name: tgi-gaudi-server
     ports:
       - "3006:80"
@@ -67,7 +67,7 @@ services:
       HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HF_HUB_DISABLE_PROGRESS_BARS: 1
       HF_HUB_ENABLE_HF_TRANSFER: 0
-      HABANA_VISIBLE_MODULES: all
+      HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       ENABLE_HPU_GRAPH: true
       LIMIT_HPU_GRAPH: true
@@ -105,7 +105,7 @@ services:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      HABANA_VISIBLE_MODULES: all
+      HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       DEVICE: ${DEVICE}
       INFERENCE_MODE: ${INFERENCE_MODE}
@@ -132,7 +132,7 @@ services:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
-      HABANA_VISIBLE_MODULES: all
+      HABANA_VISIBLE_DEVICES: all
       OMPI_MCA_btl_vader_single_copy_mechanism: none
       WAV2LIP_ENDPOINT: ${WAV2LIP_ENDPOINT}
     runtime: habana
diff --git a/AvatarChatbot/tests/test_compose_on_gaudi.sh b/AvatarChatbot/tests/test_compose_on_gaudi.sh
old mode 100644
new mode 100755
index fc56194b0f..aab0e3b68b
--- a/AvatarChatbot/tests/test_compose_on_gaudi.sh
+++ b/AvatarChatbot/tests/test_compose_on_gaudi.sh
@@ -29,7 +29,7 @@ function build_docker_images() {
   service_list="avatarchatbot whisper-gaudi asr llm-tgi speecht5-gaudi tts wav2lip-gaudi animation"
   docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
-  docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
+  docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
   docker images && sleep 1s
 }
@@ -74,7 +74,7 @@ function start_services() {
   export FPS=10
   # Start Docker Containers
-  docker compose up -d
+  docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
   n=0
   until [[ "$n" -ge 100 ]]; do
@@ -86,7 +86,6 @@
     n=$((n+1))
   done
-  # sleep 5m
   echo "All services are up and running"
   sleep 5s
 }
@@ -99,6 +98,7 @@ function validate_megaservice() {
   if [[ $result == *"mp4"* ]]; then
     echo "Result correct."
   else
+    echo "Result wrong, print docker logs."
     docker logs whisper-service > $LOG_PATH/whisper-service.log
     docker logs asr-service > $LOG_PATH/asr-service.log
     docker logs speecht5-service > $LOG_PATH/speecht5-service.log
     docker logs tts-service > $LOG_PATH/tts-service.log
     docker logs tgi-gaudi-server > $LOG_PATH/tgi-gaudi-server.log
     docker logs llm-tgi-gaudi-server > $LOG_PATH/llm-tgi-gaudi-server.log
     docker logs wav2lip-service > $LOG_PATH/wav2lip-service.log
     docker logs animation-gaudi-server > $LOG_PATH/animation-gaudi-server.log
-
-    echo "Result wrong."
+    echo "Exit test."
exit 1 fi } -#function validate_frontend() { - -#} - - function stop_docker() { cd $WORKPATH/docker_compose/intel/hpu/gaudi docker compose down @@ -127,15 +121,17 @@ function stop_docker() { function main() { - stop_docker + echo y | docker builder prune --all + echo y | docker image prune + if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi start_services # validate_microservices validate_megaservice # validate_frontend - stop_docker + stop_docker echo y | docker builder prune --all echo y | docker image prune diff --git a/AvatarChatbot/tests/test_compose_on_xeon.sh b/AvatarChatbot/tests/test_compose_on_xeon.sh old mode 100644 new mode 100755 index 1b1780a1b4..2bed682cfd --- a/AvatarChatbot/tests/test_compose_on_xeon.sh +++ b/AvatarChatbot/tests/test_compose_on_xeon.sh @@ -29,7 +29,7 @@ function build_docker_images() { service_list="avatarchatbot whisper asr llm-tgi speecht5 tts wav2lip animation" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker images && sleep 1s } diff --git a/AvatarChatbot/ui/gradio/app_gradio_demo.py b/AvatarChatbot/ui/gradio/app_gradio_demo.py deleted file mode 100644 index 9317570e5c..0000000000 --- a/AvatarChatbot/ui/gradio/app_gradio_demo.py +++ /dev/null @@ -1,444 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import asyncio -import base64 -import io -import os -import shutil -import subprocess -import time - -import aiohttp -import gradio as gr -import numpy as np -import requests -import soundfile as sf -from PIL import Image - - -# %% AudioQnA functions -def preprocess_audio(audio): - """The audio data is a 16-bit integer array with values ranging from -32768 to 32767 and the shape of the audio data array is (samples,)""" - sr, y = audio - # Convert to normalized float32 audio - y = y.astype(np.float32) - y /= np.max(np.abs(y)) - # Convert the normalized float32 audio to a WAV file in memory - buf = io.BytesIO() - sf.write(buf, y, sr, format="WAV") - buf.seek(0) # Reset the buffer position to the beginning - # Encode the WAV file to base64 string - base64_bytes = base64.b64encode(buf.read()) - base64_string = base64_bytes.decode("utf-8") - return base64_string - - -def base64_to_int16(base64_string): - wav_bytes = base64.b64decode(base64_string) - buf = io.BytesIO(wav_bytes) - y, sr = sf.read(buf, dtype="int16") - return sr, y - - -async def transcribe(audio_input): - """Input: mic audio; Output: ai audio, text, text""" - global ai_chatbot_url, chat_history - chat_history = "" - # Preprocess the audio - base64bytestr = preprocess_audio(audio_input) - - # if not audio_choice: - # base64bytestr = preprocess_audio(audio_input) - # else: - # # convert wav file to base64 - # audio_index = int(audio_choice.split(".")[0]) - 1 - # audio_filepath = audio_filepaths[audio_index] - # audio_input.value = audio_filepath - # with open(audio_filepath, "rb") as file: - # base64bytestr = base64.b64encode(file.read()).decode('utf-8') - - # Send the audio to the backend server - initial_inputs = {"audio": base64bytestr, "max_tokens": 64} - - async with aiohttp.ClientSession() as session: - async with session.post(ai_chatbot_url, json=initial_inputs) as response: - # response = requests.post(ai_chatbot_url, json=initial_inputs) - - # Check the response status code - if response.status == 200: - response_json = await response.json() - # with open("response.txt", "w") as 
file: - # file.write(response) - - # Decode the base64 string - sampling_rate, audio_int16 = base64_to_int16(response_json["byte_str"]) - chat_history += f"User: {response_json['query']}\n\n" - - chat_ai = response_json["text"] - hitted_ends = [",", ".", "?", "!", "。", ";"] - last_punc_idx = max([chat_ai.rfind(punc) for punc in hitted_ends]) - if last_punc_idx != -1: - chat_ai = chat_ai[: last_punc_idx + 1] - chat_history += f"AI: {chat_ai}" - chat_history = chat_history.replace("OPEX", "OPEA") - return (sampling_rate, audio_int16) # handle the response - else: - return {"error": "Failed to transcribe audio", "status_code": response.status_code} - - -def resize_image(image_pil, size=(720, 720)): - """Resize the image to the specified size.""" - return image_pil.resize(size, Image.LANCZOS) - - -def resize_video(video_path, save_path, size=(720, 1280)): - """Resize the video to the specified size.""" - # command_resize_video = f"ffmpeg -y -i {video_path} -vf scale={size[0]}:{size[1]} {save_path}" - # subprocess.run(command_resize_video, shell=True) - - -# %% Wav2Lip functions -async def gen_video(image, audio, model_choice): - """Input: image (saved .png path), ai audio (saved .wav path); Output: video""" - # 0. Preprocess audio - # buf = io.BytesIO() - sr, y = audio - output_audio_save_path = "inputs/intermediate.wav" - sf.write(output_audio_save_path, y, sr, format="WAV") - - # 1. Set environment variables - match model_choice: - case "wav2lip": - os.environ["INFERENCE_MODE"] = "wav2lip_only" - os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip.pth" - case "wav2lip+GAN": - os.environ["INFERENCE_MODE"] = "wav2lip_only" - os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip_gan.pth" - case "wav2lip+GFPGAN": - os.environ["INFERENCE_MODE"] = "wav2lip+gfpgan" - os.environ["CHECKPOINT_PATH"] = "Wav2Lip/checkpoints/wav2lip.pth" - - # os.environ['INFERENCE_MODE'] = 'wav2lip_only' - # os.environ['CHECKPOINT_PATH'] = 'Wav2Lip/checkpoints/wav2lip_gan.pth' - os.environ["FACE"] = image # path to either an image or a video - os.environ["AUDIO"] = output_audio_save_path # path to .wav audio - # os.environ['AUDIO'] = audio - os.environ["FACESIZE"] = "96" - os.environ["OUTFILE"] = "outputs/result6.mp4" - os.environ["GFPGAN_MODEL_VERSION"] = "1.3" - os.environ["UPSCALE_FACTOR"] = "1" # int - # os.environ['FPS'] = '25.' # can be lower (e.g., 10) - os.environ["FPS"] = "10." # can be lower when using an image (e.g., 10) - - # 2. 
Run inference.sh bash script to perform Wav2Lip+GFPGAN inference - # Output video is saved at the path 'OUTFILE' - # command_wav2lip_gfpgan = "bash inference_vars.sh" - # subprocess.run(command_wav2lip_gfpgan, shell=True) - - outfile = os.environ.get("OUTFILE") - if os.path.exists(outfile): - res_video = outfile - else: - res_video = "inputs/loading.mp4" - return res_video - - -# %% AI Avatar demo function -# ctao 7/19 - make it asynchronous -async def aiavatar_demo(audio_input): - """Input: mic audio, image; Output: ai audio, text, text, ai video""" - # Include AudioQnA - output_audio = await transcribe(audio_input) # AudioQnA - - if isinstance(output_audio, dict): # in case of an error - return None, None - else: - sr, audio_int16 = output_audio - audio_file = "outputs/output_audio.wav" - sf.write(audio_file, audio_int16, sr) - # return audio_file, audio_file, image - return audio_file - - -async def final_update(audio, image, model_choice): - res_video = await gen_video(image, audio, model_choice) - return res_video - - -# %% Main -if __name__ == "__main__": - # HOST_IP = os.getenv("host_ip") - HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode("utf-8").strip() - - # Fetch the AudioQnA backend server - ai_chatbot_url = f"http://{HOST_IP}:3008/v1/audioqna" - - # Collect chat history to print in the interface - chat_history = "" - - # Prepare 3 image paths - # HOME = os.getenv("HOME") - # HOME="/mnt/localdisk4" - HOME = "/home/demo/" - image_pils = [ - Image.open(os.path.join("../assets/img/woman1.png")), - Image.open(os.path.join("../assets/img/man1.png")), - Image.open(os.path.join("../assets/img/woman2.png")), - ] - - video_paths = [ - os.path.join("../assets/video/man1.mp4"), - os.path.join("../assets/video/woman2.mp4"), - os.path.join("../assets/video/man4.mp4"), - ] - - def image_to_base64(image_path): - with open(image_path, "rb") as image_file: - return base64.b64encode(image_file.read()).decode("utf-8") - - # Convert your images to Base64 - # opea_qr_base64 = image_to_base64('../rfcs/opea_qr.png') - # opea_gh_qr_base64 = image_to_base64('../rfcs/opea_gh_qr.png') - xeon_base64 = image_to_base64("../rfcs/xeon.jpg") - gaudi_base64 = image_to_base64("../rfcs/gaudi.png") - - # List of prerecorded WAV files containing audio questions - audio_filepaths = [ - "../assets/audio/intel1.wav", - "../assets/audio/intel2.wav", - "../assets/audio/intel3.wav", - "../assets/audio/intel4.wav", - "../assets/audio/pnp1.wav", - "../assets/audio/pnp2.wav", - "../assets/audio/pnp3.wav", - "../assets/audio/pnp4.wav", - "../assets/audio/entertainment1.wav", - "../assets/audio/entertainment2.wav", - ] - audio_questions = [ - "1. What are the latest data center processor and AI accelerator products at Intel? Name them.", - "2. What's the objective of the Open Platform for Enterprise AI? How is it helpful to enterprises building AI solutions?", - "3. What is Intel's Gaudi 3 AI Accelerator performance compared to Nvidia H100?", - "4. What kinds of Intel AI tools are available to accelerate AI workloads?", - "5. What is Plug and Play Technology Center? Where is it located?", - "6. Tell us about inflation in the US in the past few years?", - "7. What is the difference between an index fund and a mutual fund?", - "8. What is the difference between pretax and roth retirement accounts?", - "9. Which team won the Superbowl in 2022?", - "10. 
In the Lord of the Rings, who threw the Ring into Mount Doom?", - ] - - # Demo frontend - demo = gr.Blocks() - with demo: - # Define processing functions - count = 0 - - def initial_process(audio_input): - global count, chat_history - start_time = time.time() - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - audio_file = loop.run_until_complete(aiavatar_demo(audio_input)) - count += 1 - end_time = time.time() - return audio_file, gr.State(value=str(count)), f"{(end_time - start_time):.1f} seconds", chat_history - - def final_process(audio, image, model_choice): - start_time = time.time() - # loop = asyncio.get_event_loop() - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - res_video = loop.run_until_complete(final_update(audio, image, model_choice)) - end_time = time.time() - return res_video, f"{(end_time - start_time):.1f} seconds" - - def update_selected_image_state(image_index): - selected_image_state.value = image_index - # change image_input here - if image_index < len(image_pils): - return f"inputs/face_{image_index}.png" - else: - return f"inputs/video_{image_index - len(image_pils)}.mp4" - - def update_audio_input(audio_choice): - if audio_choice: - audio_index = int(audio_choice.split(".")[0]) - 1 - audio_filepath_gradio = f"inputs/audio_{audio_index:d}.wav" - shutil.copyfile(audio_filepaths[audio_index], audio_filepath_gradio) - # audio_input.value = audio_filepath_gradio - return audio_filepath_gradio - - # UI Components - # Title & Introduction - gr.Markdown("

A PyTorch and OPEA based AI Avatar Audio Chatbot

") - # gr.Markdown("# **Using OPEA to implement a RAG-Powered Human-Like AI Avatar Audio Chatbot**") - with gr.Row(): - with gr.Column(scale=8): - gr.Markdown( - """ -

Welcome to our AI Avatar Audio Chatbot! This application leverages PyTorch and OPEA (Open Platform for Enterprise AI) v0.8 to provide you with a human-like conversational experience. It's run on Intel® Gaudi® AI Accelerator and Intel® Xeon® Processor, with hardware and software optimizations.
- Please feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.

- """ - ) - with gr.Column(scale=1): - # with gr.Row(): - # gr.Markdown(f""" - # OPEA QR Code - # """, label="OPEA QR Code") - # gr.Markdown(f""" - # OPEA GitHub QR Code - # """, label="OPEA GitHub QR Code") - with gr.Row(): - gr.Markdown( - f""" - Intel®Gaudi""", - label="Intel®Gaudi", - ) - gr.Markdown( - f""" - Intel®Xeon""", - label="Intel®Xeon", - ) - gr.Markdown("
") # Divider - - # Inputs - # Image gallery - selected_image_state = gr.State(value=-1) - image_clicks = [] - image_click_buttons = [] - video_clicks = [] - video_click_buttons = [] - with gr.Row(): - with gr.Column(scale=1): - audio_input = gr.Audio(sources=None, format="wav", label="🎤 or 📤 for your Input audio!") - audio_choice = gr.Dropdown( - choices=audio_questions, - label="Choose an audio question", - value=None, # default value - ) - # Update audio_input when a selection is made from the dropdown - audio_choice.change(fn=update_audio_input, inputs=audio_choice, outputs=audio_input) - - face_input = gr.File( - file_count="single", - file_types=["image", "video"], - label="Choose an avatar or 📤 an image or video!", - ) - model_choice = gr.Dropdown( - choices=["wav2lip", "wav2lip+GAN", "wav2lip+GFPGAN"], - label="Choose a DL model", - ) - with gr.Column(scale=2): - # Display 3 images and buttons - with gr.Row(): - for i, image_pil in enumerate(image_pils): - image_pil = resize_image(image_pil) - save_path = f"inputs/face_{i}.png" - image_pil.save(save_path, "PNG") - image_clicks.append(gr.Image(type="filepath", value=save_path, label=f"Avatar {i+1}")) - with gr.Row(): - for i in range(len(image_pils)): - image_click_buttons.append(gr.Button(f"Use Image {i+1}")) - # Display 3 videos and buttons - with gr.Row(): - for i, video_path in enumerate(video_paths): - save_path = f"inputs/video_{i}.mp4" - # shutil.copyfile(video_path, save_path) - resize_video(video_path, save_path) - video_clicks.append(gr.Video(value=save_path, label=f"Video {i+1}")) - with gr.Row(): - for i in range(len(video_paths)): - video_click_buttons.append(gr.Button(f"Use Video {i+1}")) - - submit_button = gr.Button("Submit") - - # Outputs - gr.Markdown("
") # Divider - with gr.Row(): - with gr.Column(scale=1): - audio_output_interm = gr.Audio(label="🔊 Output audio", autoplay=True) - chat_history_box = gr.Textbox(label="Chat History", value=chat_history) - audio_time_text = gr.Textbox(label="Audio processing time", value="0.0 seconds") - with gr.Column(scale=2): - video_output = gr.Video(label="Your AI Avatar video: ", format="mp4", width=1280, height=720) - video_time_text = gr.Textbox(label="Video processing time", value="0.0 seconds") - - # Technical details - gr.Markdown("
") # Divider - with gr.Row(): - gr.Markdown( - """ -

-OPEA megaservice deployed:
-  • AvatarChatbot
-
-OPEA microservices deployed:
-  • ASR (service: opea/whisper-gaudi, model: openai/whisper-small)
-  • LLM 'text-generation' (service: opea/llm-tgi, model: Intel/neural-chat-7b-v3-3)
-  • TTS (service: opea/speecht5-gaudi, model: microsoft/speecht5_tts)
-  • Animation (service: opea/animation, model: wav2lip+gfpgan)
- """ - ) - #

OPEA's "AvatarChatbot" megaservice is composed of "ASR->LLM->TTS->Animation" microservices. It first generates an expert answer based on your query, and then animates the avatar figure with output audio. Feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.

- with gr.Row(): - gr.Image("./flowchart_1.png", label="Megaservice Flowchart") - with gr.Row(): - gr.Markdown( - """ -

-The AI Avatar Audio Chatbot is powered by the following Intel® AI software:
-  • Intel Gaudi Software v1.17.0
-  • PyTorch v2.3.1 (Eager mode + torch.compile)
-  • HPU Graph
-  • Intel Neural Compressor (INC)
- """ - ) - - # Disclaimer - gr.Markdown("
") # Divider - gr.Markdown("

Notices & Disclaimers

") - gr.Markdown( - """ -

Intel is committed to respecting human rights and avoiding complicity in human rights abuses. See Intel's Global Human Rights Principles. Intel's products and software are intended only to be used in applications that do not cause or contribute to a violation of an internationally recognized human right.

-

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

-

You may not use or facilitate the use of this document in connection with any infringement or other legal analysis concerning Intel products described herein. You agree to grant Intel a non-exclusive, royalty-free license to any patent claim thereafter drafted which includes subject matter disclosed herein.

- """ - ) - - # States - interm_state = gr.State(value="initial") - - # State transitions - for i in range(len(image_pils)): - image_click_buttons[i].click( - update_selected_image_state, inputs=[gr.Number(value=i, visible=False)], outputs=[face_input] - ) - for i in range(len(video_paths)): - video_click_buttons[i].click( - update_selected_image_state, - inputs=[gr.Number(value=i + len(image_pils), visible=False)], - outputs=[face_input], - ) - # submit_button = gr.Button("Submit") - submit_button.click( - initial_process, - inputs=[audio_input], - outputs=[ - audio_output_interm, - interm_state, - audio_time_text, - chat_history_box, - ], # need to change interm_state - ) - interm_state.change( - final_process, - inputs=[audio_output_interm, face_input, model_choice], - outputs=[video_output, video_time_text], - ) - - demo.queue().launch(server_name="0.0.0.0", server_port=7861) diff --git a/AvatarChatbot/ui/gradio/app_gradio_demo_avatarchatbot.py b/AvatarChatbot/ui/gradio/app_gradio_demo_avatarchatbot.py new file mode 100644 index 0000000000..19817d5051 --- /dev/null +++ b/AvatarChatbot/ui/gradio/app_gradio_demo_avatarchatbot.py @@ -0,0 +1,349 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import asyncio +import base64 +import io +import os +import shutil +import subprocess +import time + +import aiohttp +import docker +import ffmpeg +import gradio as gr +import numpy as np +import soundfile as sf +from PIL import Image + + +# %% Docker Management +def update_env_var_in_container(container_name, env_var, new_value): + return + + +# %% AudioQnA functions +def preprocess_audio(audio): + """The audio data is a 16-bit integer array with values ranging from -32768 to 32767 and the shape of the audio data array is (samples,)""" + sr, y = audio + + # Convert to normalized float32 audio + y = y.astype(np.float32) + y /= np.max(np.abs(y)) + + # Save to memory + buf = io.BytesIO() + sf.write(buf, y, sr, format="WAV") + buf.seek(0) # Reset the buffer position to the beginning + + # Encode the WAV file to base64 string + base64_bytes = base64.b64encode(buf.read()) + base64_string = base64_bytes.decode("utf-8") + return base64_string + + +def base64_to_int16(base64_string): + wav_bytes = base64.b64decode(base64_string) + buf = io.BytesIO(wav_bytes) + y, sr = sf.read(buf, dtype="int16") + return sr, y + + +async def transcribe(audio_input, face_input, model_choice): + """Input: mic audio; Output: ai audio, text, text""" + global ai_chatbot_url, chat_history, count + chat_history = "" + # Preprocess the audio + base64bytestr = preprocess_audio(audio_input) + + # Send the audio to the AvatarChatbot backend server endpoint + initial_inputs = {"audio": base64bytestr, "max_tokens": 64} + + # TO-DO: update wav2lip-service with the chosen face_input + # update_env_var_in_container("wav2lip-service", "DEVICE", "new_device_value") + + async with aiohttp.ClientSession() as session: + async with session.post(ai_chatbot_url, json=initial_inputs) as response: + + # Check the response status code + if response.status == 200: + # response_json = await response.json() + # # Decode the base64 string + # sampling_rate, audio_int16 = base64_to_int16(response_json["byte_str"]) + # chat_history += f"User: {response_json['query']}\n\n" + # chat_ai = response_json["text"] + # hitted_ends = [",", ".", "?", "!", "。", ";"] + # last_punc_idx = max([chat_ai.rfind(punc) for punc in hitted_ends]) + # if last_punc_idx != -1: + # chat_ai = chat_ai[: last_punc_idx + 1] + # chat_history += f"AI: 
{chat_ai}" + # chat_history = chat_history.replace("OPEX", "OPEA") + # return (sampling_rate, audio_int16) # handle the response + + result = await response.text() + return "docker_compose/intel/hpu/gaudi/result.mp4" + else: + return {"error": "Failed to transcribe audio", "status_code": response.status_code} + + +def resize_image(image_pil, size=(720, 720)): + """Resize the image to the specified size.""" + return image_pil.resize(size, Image.LANCZOS) + + +def resize_video(video_path, save_path, size=(720, 1280)): + """Resize the video to the specified size, and save to the save path.""" + ffmpeg.input(video_path).output(save_path, vf=f"scale={size[0]}:{size[1]}").overwrite_output().run() + + +# %% AI Avatar demo function +async def aiavatar_demo(audio_input, face_input, model_choice): + """Input: mic/preloaded audio, avatar file path; + Output: ai video""" + # Wait for response from AvatarChatbot backend + output_video = await transcribe(audio_input, face_input, model_choice) # output video path + + if isinstance(output_video, dict): # in case of an error + return None, None + else: + return output_video + + +# %% Main +if __name__ == "__main__": + # HOST_IP = os.getenv("host_ip") + HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode("utf-8").strip() + + # Fetch the AudioQnA backend server + ai_chatbot_url = f"http://{HOST_IP}:3009/v1/avatarchatbot" + + # Collect chat history to print in the interface + chat_history = "" + + # Prepare 3 image paths and 3 video paths + # image_pils = [ + # Image.open(os.path.join("assets/img/woman1.png")), + # Image.open(os.path.join("assets/img/man1.png")), + # Image.open(os.path.join("assets/img/woman2.png")), + # ] + + # video_paths = [ + # os.path.join("assets/video/man1.mp4"), + # os.path.join("assets/video/woman2.mp4"), + # os.path.join("assets/video/man4.mp4"), + # ] + + def image_to_base64(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + # Convert your images to Base64 + xeon_base64 = image_to_base64("assets/img/xeon.jpg") + gaudi_base64 = image_to_base64("assets/img/gaudi.png") + + # List of prerecorded WAV files containing audio questions + # audio_filepaths = [ + # "assets/audio/intel2.wav", + # "assets/audio/intel4.wav", + # ] + # audio_questions = [ + # "1. What's the objective of the Open Platform for Enterprise AI? How is it helpful to enterprises building AI solutions?", + # "2. 
What kinds of Intel AI tools are available to accelerate AI workloads?", + # ] + + # Demo frontend + demo = gr.Blocks() + with demo: + # Define processing functions + count = 0 + + # Make necessary folders: + if not os.path.exists("inputs"): + os.makedirs("inputs") + if not os.path.exists("outputs"): + os.makedirs("outputs") + + def initial_process(audio_input, face_input, model_choice): + global count + start_time = time.time() + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + video_file = loop.run_until_complete(aiavatar_demo(audio_input, face_input, model_choice)) + count += 1 + end_time = time.time() + return video_file, f"The entire application took {(end_time - start_time):.1f} seconds" + + # def update_selected_image_state(image_index): + # image_index = int(image_index) + # selected_image_state.value = image_index + # # change image_input here + # if image_index < len(image_pils): + # return f"inputs/face_{image_index}.png" + # else: + # return f"inputs/video_{image_index - len(image_pils)}.mp4" + + # def update_audio_input(audio_choice): + # if audio_choice: + # audio_index = int(audio_choice.split(".")[0]) - 1 + # audio_filepath_gradio = f"inputs/audio_{audio_index:d}.wav" + # shutil.copyfile(audio_filepaths[audio_index], audio_filepath_gradio) + # return audio_filepath_gradio + + # UI Components + # Title & Introduction + gr.Markdown("

A PyTorch and OPEA based AI Avatar Audio Chatbot

") + with gr.Row(): + with gr.Column(scale=8): + gr.Markdown( + """ +

Welcome to our AI Avatar Audio Chatbot! This application leverages PyTorch and OPEA (Open Platform for Enterprise AI) v0.8 to provide you with a human-like conversational experience. It's run on Intel® Gaudi® AI Accelerator and Intel® Xeon® Processor, with hardware and software optimizations.
+ Please feel free to interact with the AI avatar by choosing your own avatar and talking into the mic.

+ """ + ) + with gr.Column(scale=1): + # with gr.Row(): + # gr.Markdown(f""" + # OPEA QR Code + # """, label="OPEA QR Code") + # gr.Markdown(f""" + # OPEA GitHub QR Code + # """, label="OPEA GitHub QR Code") + with gr.Row(): + gr.Markdown( + f""" + Intel®Gaudi""", + label="Intel®Gaudi", + ) + gr.Markdown( + f""" + Intel®Xeon""", + label="Intel®Xeon", + ) + gr.Markdown("
") # Divider + + # Inputs + # Image gallery + selected_image_state = gr.State(value=-1) + image_clicks = [] + image_click_buttons = [] + video_clicks = [] + video_click_buttons = [] + with gr.Row(): + with gr.Column(scale=1): + audio_input = gr.Audio( + sources=["upload", "microphone"], format="wav", label="🎤 or 📤 for your Input audio!" + ) + # audio_choice = gr.Dropdown( + # choices=audio_questions, + # label="Choose an audio question", + # value=None, # default value + # ) + # Update audio_input when a selection is made from the dropdown + # audio_choice.change(fn=update_audio_input, inputs=audio_choice, outputs=audio_input) + + face_input = gr.File( + file_count="single", + file_types=["image", "video"], + label="Choose an avatar or 📤 an image or video!", + ) + model_choice = gr.Dropdown( + choices=["wav2lip", "wav2lip+GAN", "wav2lip+GFPGAN"], + label="Choose a DL model", + ) + # with gr.Column(scale=2): + # # Display 3 images and buttons + # with gr.Row(): + # for i, image_pil in enumerate(image_pils): + # image_pil = resize_image(image_pil) + # save_path = f"inputs/face_{int(i)}.png" + # image_pil.save(save_path, "PNG") + # image_clicks.append(gr.Image(type="filepath", value=save_path, label=f"Avatar {int(i)+1}")) + # with gr.Row(): + # for i in range(len(image_pils)): + # image_click_buttons.append(gr.Button(f"Use Image {i+1}")) + + # # Display 3 videos and buttons + # with gr.Row(): + # for i, video_path in enumerate(video_paths): + # save_path = f"inputs/video_{int(i)}.mp4" + # resize_video(video_path, save_path) + # video_clicks.append(gr.Video(value=save_path, label=f"Video {int(i)+1}")) + # with gr.Row(): + # for i in range(len(video_paths)): + # video_click_buttons.append(gr.Button(f"Use Video {int(i)+1}")) + + submit_button = gr.Button("Submit") + + # Outputs + gr.Markdown("
") # Divider + with gr.Row(): + with gr.Column(): + video_output = gr.Video(label="Your AI Avatar video: ", format="mp4", width=1280, height=720) + video_time_text = gr.Textbox(label="Video processing time", value="0.0 seconds") + + # Technical details + gr.Markdown("
") # Divider + with gr.Row(): + gr.Markdown( + """ +

+OPEA megaservice deployed:
+  • AvatarChatbot
+
+OPEA microservices deployed:
+  • ASR (service: opea/whisper-gaudi, model: openai/whisper-small)
+  • LLM 'text-generation' (service: opea/llm-tgi, model: Intel/neural-chat-7b-v3-3)
+  • TTS (service: opea/speecht5-gaudi, model: microsoft/speecht5_tts)
+  • Animation (service: opea/animation, model: wav2lip+gfpgan)
+ """ + ) + with gr.Row(): + gr.Image("assets/img/flowchart.png", label="Megaservice Flowchart") + with gr.Row(): + gr.Markdown( + """ +

+The AI Avatar Audio Chatbot is powered by the following Intel® AI software:
+  • Intel Gaudi Software v1.17.0
+  • PyTorch v2.3.1 (Eager mode + torch.compile)
+  • HPU Graph
+  • Intel Neural Compressor (INC)
+ """ + ) + + # Disclaimer + gr.Markdown("
") # Divider + gr.Markdown("

Notices & Disclaimers

") + gr.Markdown( + """ +

Intel is committed to respecting human rights and avoiding complicity in human rights abuses. See Intel's Global Human Rights Principles. Intel's products and software are intended only to be used in applications that do not cause or contribute to a violation of an internationally recognized human right.

+

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

+

You may not use or facilitate the use of this document in connection with any infringement or other legal analysis concerning Intel products described herein. You agree to grant Intel a non-exclusive, royalty-free license to any patent claim thereafter drafted which includes subject matter disclosed herein.

+ """ + ) + + # State transitions + # for i in range(len(image_pils)): + # image_click_buttons[i].click( + # update_selected_image_state, inputs=[gr.Number(value=i, visible=False)], outputs=[face_input] + # ) + # for i in range(len(video_paths)): + # video_click_buttons[i].click( + # update_selected_image_state, + # inputs=[gr.Number(value=i + len(image_pils), visible=False)], + # outputs=[face_input], + # ) + submit_button.click( + initial_process, + inputs=[audio_input, face_input, model_choice], + outputs=[ + video_output, + video_time_text, + ], + ) + + demo.queue().launch(server_name="0.0.0.0", server_port=7861) diff --git a/ChatQnA/Dockerfile b/ChatQnA/Dockerfile index ee84069a25..4e431ac773 100644 --- a/ChatQnA/Dockerfile +++ b/ChatQnA/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ pip install --no-cache-dir langchain_core diff --git a/ChatQnA/Dockerfile.guardrails b/ChatQnA/Dockerfile.guardrails index 168dfb138a..ed811148c0 100644 --- a/ChatQnA/Dockerfile.guardrails +++ b/ChatQnA/Dockerfile.guardrails @@ -18,7 +18,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ pip install --no-cache-dir langchain_core diff --git a/ChatQnA/Dockerfile.without_rerank b/ChatQnA/Dockerfile.without_rerank index 030aef1594..7d3a94c5de 100644 --- a/ChatQnA/Dockerfile.without_rerank +++ b/ChatQnA/Dockerfile.without_rerank @@ -18,7 +18,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ pip install --no-cache-dir langchain_core diff --git a/ChatQnA/Dockerfile.wrapper b/ChatQnA/Dockerfile.wrapper new file mode 100644 index 0000000000..c06a6811bd --- /dev/null +++ b/ChatQnA/Dockerfile.wrapper @@ -0,0 +1,34 @@ + + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + git + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +WORKDIR /home/user/ +RUN git clone https://github.com/opea-project/GenAIComps.git + +WORKDIR /home/user/GenAIComps +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt + +COPY ./chatqna_wrapper.py /home/user/chatqna.py + +ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps + +USER user + +WORKDIR /home/user + +RUN echo 'ulimit -S -n 999999' >> ~/.bashrc + +ENTRYPOINT ["python", "chatqna.py"] diff --git a/ChatQnA/README.md b/ChatQnA/README.md index 24569cc746..e3daf68508 100644 --- a/ChatQnA/README.md +++ b/ChatQnA/README.md @@ -4,7 +4,26 @@ Chatbots are the most widely adopted use case for leveraging the powerful chat a RAG bridges the knowledge gap by dynamically fetching relevant information from external sources, ensuring that 
responses generated remain factual and current. At the core of this architecture are vector databases, which are instrumental in enabling efficient and semantic retrieval of information. These databases store data as vectors, allowing RAG to swiftly access the most pertinent documents or data points based on semantic similarity.

-## Deploy ChatQnA Service
+## 🤖 Automated Terraform Deployment using Intel® Optimized Cloud Modules for **Terraform**
+
+| Cloud Provider       | Intel Architecture                | Intel Optimized Cloud Module for Terraform                                                                                         | Comments                                                             |
+| -------------------- | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------- |
+| AWS                  | 4th Gen Intel Xeon with Intel AMX | [AWS Module](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna)                           | Uses Intel/neural-chat-7b-v3-3 by default                            |
+| AWS Falcon2-11B      | 4th Gen Intel Xeon with Intel AMX | [AWS Module with Falcon11B](https://github.com/intel/terraform-intel-aws-vm/tree/main/examples/gen-ai-xeon-opea-chatqna-falcon11B) | Uses TII Falcon2-11B LLM Model                                       |
+| GCP                  | 5th Gen Intel Xeon with Intel AMX | [GCP Module](https://github.com/intel/terraform-intel-gcp-vm/tree/main/examples/gen-ai-xeon-opea-chatqna)                           | Also supports Confidential AI by using Intel® TDX with 4th Gen Xeon |
+| Azure                | 5th Gen Intel Xeon with Intel AMX | Work-in-progress                                                                                                                     | Work-in-progress                                                     |
+| Intel Tiber AI Cloud | 5th Gen Intel Xeon with Intel AMX | Work-in-progress                                                                                                                     | Work-in-progress                                                     |
+
+## Automated Deployment to Ubuntu-based systems (if not using Terraform) using Intel® Optimized Cloud Modules for **Ansible**
+
+To deploy to an existing Xeon Ubuntu-based system, use our Intel Optimized Cloud Modules for Ansible. This is the same Ansible playbook used by Terraform.
+Use this if you are not using Terraform and have provisioned your system with another tool or manually, including bare metal.
+
+| Operating System | Intel Optimized Cloud Module for Ansible                                                                          |
+| ---------------- | ------------------------------------------------------------------------------------------------------------------ |
+| Ubuntu 20.04     | [ChatQnA Ansible Module](https://github.com/intel/optimized-cloud-recipes/tree/main/recipes/ai-opea-chatqna-xeon) |
+| Ubuntu 22.04     | Work-in-progress                                                                                                   |
+
+## Manually Deploy ChatQnA Service

The ChatQnA service can be effortlessly deployed on Intel Gaudi2, Intel Xeon Scalable Processors and Nvidia GPU.
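To get a concrete feel for the embedding step behind that retrieval, here is a hedged sketch against a TEI embedding service; the `{"inputs": ...}` payload and the `/embed` route match the call used by `eval_multihop.py` in the accuracy benchmark below, while the host and port are assumptions:

```bash
# Illustrative: turn a query into a vector with a TEI server. The response
# is a list of embeddings, one per input, so the query's vector is element [0].
curl http://localhost:8090/embed \
  -X POST \
  -H "Content-Type: application/json" \
  -d '{"inputs": "What is OPEA?"}'
```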
diff --git a/ChatQnA/benchmark/accuracy/README.md b/ChatQnA/benchmark/accuracy/README.md index 0cfae4564b..c073139486 100644 --- a/ChatQnA/benchmark/accuracy/README.md +++ b/ChatQnA/benchmark/accuracy/README.md @@ -48,7 +48,7 @@ To setup a LLM model, we can use [tgi-gaudi](https://github.com/huggingface/tgi- docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 # for better performance, set `PREFILL_BATCH_BUCKET_SIZE`, `BATCH_BUCKET_SIZE`, `max-batch-total-tokens`, `max-batch-prefill-tokens` -docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048 +docker run -p {your_llm_port}:80 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HF_TOKEN={your_hf_token} -e PREFILL_BATCH_BUCKET_SIZE=1 -e BATCH_BUCKET_SIZE=8 --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id mistralai/Mixtral-8x7B-Instruct-v0.1 --max-input-tokens 2048 --max-total-tokens 4096 --sharded true --num-shard 2 --max-batch-total-tokens 65536 --max-batch-prefill-tokens 2048 ``` ### Prepare Dataset diff --git a/ChatQnA/benchmark/accuracy/eval_multihop.py b/ChatQnA/benchmark/accuracy/eval_multihop.py index 9b07ea2e34..a8f2b9911a 100644 --- a/ChatQnA/benchmark/accuracy/eval_multihop.py +++ b/ChatQnA/benchmark/accuracy/eval_multihop.py @@ -41,11 +41,11 @@ def get_reranked_documents(self, query, docs, arguments): return [] def get_retrieved_documents(self, query, arguments): - data = {"text": query} + data = {"inputs": query} headers = {"Content-Type": "application/json"} - response = requests.post(arguments.embedding_endpoint, data=json.dumps(data), headers=headers) + response = requests.post(arguments.tei_embedding_endpoint + "/embed", data=json.dumps(data), headers=headers) if response.ok: - embedding = response.json()["embedding"] + embedding = response.json()[0] else: print(f"Request for embedding failed due to {response.text}.") return [] diff --git a/ChatQnA/benchmark/performance/README.md b/ChatQnA/benchmark/performance-deprecated/README.md similarity index 100% rename from ChatQnA/benchmark/performance/README.md rename to ChatQnA/benchmark/performance-deprecated/README.md diff --git a/ChatQnA/benchmark/performance/benchmark.sh b/ChatQnA/benchmark/performance-deprecated/benchmark.sh similarity index 100% rename from ChatQnA/benchmark/performance/benchmark.sh rename to ChatQnA/benchmark/performance-deprecated/benchmark.sh diff --git a/ChatQnA/benchmark/performance/benchmark.yaml b/ChatQnA/benchmark/performance-deprecated/benchmark.yaml similarity index 100% rename from ChatQnA/benchmark/performance/benchmark.yaml rename to ChatQnA/benchmark/performance-deprecated/benchmark.yaml diff --git a/ChatQnA/benchmark/performance/helm_charts/.helmignore 
b/ChatQnA/benchmark/performance-deprecated/helm_charts/.helmignore similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/.helmignore rename to ChatQnA/benchmark/performance-deprecated/helm_charts/.helmignore diff --git a/ChatQnA/benchmark/performance/helm_charts/Chart.yaml b/ChatQnA/benchmark/performance-deprecated/helm_charts/Chart.yaml similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/Chart.yaml rename to ChatQnA/benchmark/performance-deprecated/helm_charts/Chart.yaml diff --git a/ChatQnA/benchmark/performance/helm_charts/README.md b/ChatQnA/benchmark/performance-deprecated/helm_charts/README.md similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/README.md rename to ChatQnA/benchmark/performance-deprecated/helm_charts/README.md diff --git a/ChatQnA/benchmark/performance/helm_charts/customize.yaml b/ChatQnA/benchmark/performance-deprecated/helm_charts/customize.yaml similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/customize.yaml rename to ChatQnA/benchmark/performance-deprecated/helm_charts/customize.yaml diff --git a/ChatQnA/benchmark/performance/helm_charts/templates/configmap.yaml b/ChatQnA/benchmark/performance-deprecated/helm_charts/templates/configmap.yaml similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/templates/configmap.yaml rename to ChatQnA/benchmark/performance-deprecated/helm_charts/templates/configmap.yaml diff --git a/ChatQnA/benchmark/performance/helm_charts/templates/deployment.yaml b/ChatQnA/benchmark/performance-deprecated/helm_charts/templates/deployment.yaml similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/templates/deployment.yaml rename to ChatQnA/benchmark/performance-deprecated/helm_charts/templates/deployment.yaml diff --git a/ChatQnA/benchmark/performance/helm_charts/templates/service.yaml b/ChatQnA/benchmark/performance-deprecated/helm_charts/templates/service.yaml similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/templates/service.yaml rename to ChatQnA/benchmark/performance-deprecated/helm_charts/templates/service.yaml diff --git a/ChatQnA/benchmark/performance/helm_charts/values.yaml b/ChatQnA/benchmark/performance-deprecated/helm_charts/values.yaml similarity index 100% rename from ChatQnA/benchmark/performance/helm_charts/values.yaml rename to ChatQnA/benchmark/performance-deprecated/helm_charts/values.yaml diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml index 8e74fe6adf..0f7d6176bb 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: 
IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml index 95f00644c7..4a5b7b6010 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml index 4fb1657076..9a8ce4a4b5 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml index 0d379f5b5f..c80fc03e33 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml 
b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml index 42e9ed4d47..91554a8121 100644 --- a/ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml index 0338a8768b..7b81e252a3 100644 --- a/ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml index 9d423ffafc..61346908fc 100644 --- a/ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml index f405bcce6c..72ada01914 100644 --- a/ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml @@ -237,7 +237,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: 
ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml index 9b47fa0be5..a9d63cb817 100644 --- a/ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml @@ -255,7 +255,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml index 813a8e44c2..7ec356d931 100644 --- a/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml @@ -255,7 +255,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml index 32e5bd8843..f64be532a8 100644 --- a/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml @@ -255,7 +255,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: 
diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml index 055f66f0db..ecf8de7b56 100644 --- a/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml @@ -255,7 +255,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml index 5d6793cd90..365cd5ab5a 100644 --- a/ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml @@ -255,7 +255,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml index 0cfb09b0f6..6af61b1ffb 100644 --- a/ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml +++ b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml @@ -255,7 +255,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent name: llm-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml 
index a9f7e46391..dc56cc96fb 100644
--- a/ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml
+++ b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml
@@ -255,7 +255,7 @@ spec:
       envFrom:
         - configMapRef:
             name: qna-config
-      image: ghcr.io/huggingface/tgi-gaudi:2.0.5
+      image: ghcr.io/huggingface/tgi-gaudi:2.0.6
       imagePullPolicy: IfNotPresent
       name: llm-dependency-deploy
       ports:
diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml
similarity index 99%
rename from ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml
rename to ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml
index a072d36ff5..f14ebc2154 100644
--- a/ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml
+++ b/ChatQnA/benchmark/performance-deprecated/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml
@@ -255,7 +255,7 @@ spec:
       envFrom:
         - configMapRef:
             name: qna-config
-      image: ghcr.io/huggingface/tgi-gaudi:2.0.5
+      image: ghcr.io/huggingface/tgi-gaudi:2.0.6
       imagePullPolicy: IfNotPresent
       name: llm-dependency-deploy
       ports:
diff --git a/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/README.md b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/README.md
new file mode 100644
index 0000000000..d667727f48
--- /dev/null
+++ b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/README.md
@@ -0,0 +1,204 @@
+# ChatQnA Benchmarking
+
+This folder contains a collection of Kubernetes manifest files for deploying the ChatQnA service across scalable nodes. It includes a comprehensive [benchmarking tool](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md) that enables throughput analysis to assess inference performance.
+
+By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community.
+
+## Purpose
+
+We aim to run these benchmarks and share them with the OPEA community for three primary reasons:
+
+- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs.
+- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case.
+- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading LLMs, serving frameworks, etc.
+
+## Metrics
+
+The benchmark reports the following metrics:
+
+- Number of Concurrent Requests
+- End-to-End Latency: P50, P90, P99 (in milliseconds)
+- End-to-End First Token Latency: P50, P90, P99 (in milliseconds)
+- Average Next Token Latency (in milliseconds)
+- Average Token Latency (in milliseconds)
+- Requests Per Second (RPS)
+- Output Tokens Per Second
+- Input Tokens Per Second
+
+Results will be displayed in the terminal and saved as a CSV file named `1_stats.csv` for easy export to spreadsheets.
+
+## Table of Contents
+
+- [Deployment](#deployment)
+  - [Prerequisites](#prerequisites)
+  - [Deployment Scenarios](#deployment-scenarios)
+    - [Case 1: Baseline Deployment with Rerank](#case-1-baseline-deployment-with-rerank)
+    - [Case 2: Baseline Deployment without Rerank](#case-2-baseline-deployment-without-rerank)
+    - [Case 3: Tuned Deployment with Rerank](#case-3-tuned-deployment-with-rerank)
+- [Benchmark](#benchmark)
+  - [Test Configurations](#test-configurations)
+  - [Test Steps](#test-steps)
+    - [Upload Retrieval File](#upload-retrieval-file)
+    - [Run Benchmark Test](#run-benchmark-test)
+    - [Data collection](#data-collection)
+- [Teardown](#teardown)
+
+## Deployment
+
+### Prerequisites
+
+- Kubernetes installation: Use [kubespray](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md) or other official Kubernetes installation guides.
+- Helm installation: Follow the [Helm documentation](https://helm.sh/docs/intro/install/#helm) to install Helm.
+- Setup Hugging Face Token
+
+  To access models and APIs from Hugging Face, set your token as an environment variable.
+  ```bash
+  export HF_TOKEN="insert-your-huggingface-token-here"
+  ```
+- Prepare Shared Models (Optional but Strongly Recommended)
+
+  Downloading models simultaneously to multiple nodes in your cluster can overload resources such as network bandwidth, memory and storage. To prevent resource exhaustion, it's recommended to preload the models in advance.
+  ```bash
+  pip install -U "huggingface_hub[cli]"
+  sudo mkdir -p /mnt/models
+  sudo chmod 777 /mnt/models
+  huggingface-cli download --cache-dir /mnt/models Intel/neural-chat-7b-v3-3
+  export MODEL_DIR=/mnt/models
+  ```
+  Once the models are downloaded, you can consider the following methods for sharing them across nodes:
+  - Persistent Volume Claim (PVC): This is the recommended approach for production setups. For more details on using PVC, refer to [PVC](https://github.com/opea-project/GenAIInfra/blob/main/helm-charts/README.md#using-persistent-volume).
+  - Local Host Path: For simpler testing, ensure that each node involved in the deployment follows the steps above to locally prepare the models. After preparing the models, use `--set global.modelUseHostPath=${MODEL_DIR}` in the deployment command.
+
+- Add OPEA Helm Repository:
+  ```bash
+  python deploy.py --add-repo
+  ```
+- Label Nodes
+  ```bash
+  python deploy.py --add-label --num-nodes 2
+  ```
+
+### Deployment Scenarios
+
+The examples below are based on a two-node setup. You can adjust the number of nodes by using the `--num-nodes` option.
+
+By default, these commands use the `default` namespace. To specify a different namespace, use the `--namespace` flag with the deploy, uninstall, and kubectl commands, as shown in the sketch below. Additionally, update the `namespace` field in `benchmark.yaml` before running the benchmark test.
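+
+For example, a sketch of a two-node deployment into a dedicated namespace (the `benchmark` namespace name here is purely illustrative) could look like:
+
+```bash
+# Deploy into a custom namespace (illustrative values)
+python deploy.py --hf-token $HF_TOKEN --model-dir $MODEL_DIR --num-nodes 2 --with-rerank --namespace benchmark
+# Uninstall from the same namespace when done
+python deploy.py --uninstall --namespace benchmark
+```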
+
+For additional configuration options, run `python deploy.py --help`
+
+#### Case 1: Baseline Deployment with Rerank
+
+Deploy Command (with node number, Hugging Face token, model directory specified):
+```bash
+python deploy.py --hf-token $HF_TOKEN --model-dir $MODEL_DIR --num-nodes 2 --with-rerank
+```
+Uninstall Command:
+```bash
+python deploy.py --uninstall
+```
+
+#### Case 2: Baseline Deployment without Rerank
+
+```bash
+python deploy.py --hf-token $HF_TOKEN --model-dir $MODEL_DIR --num-nodes 2
+```
+#### Case 3: Tuned Deployment with Rerank
+
+```bash
+python deploy.py --hf-token $HF_TOKEN --model-dir $MODEL_DIR --num-nodes 2 --with-rerank --tuned
+```
+
+## Benchmark
+
+### Test Configurations
+
+| Key      | Value   |
+| -------- | ------- |
+| Workload | ChatQnA |
+| Tag      | V1.1    |
+
+Models configuration
+| Key       | Value                     |
+| --------- | ------------------------- |
+| Embedding | BAAI/bge-base-en-v1.5     |
+| Reranking | BAAI/bge-reranker-base    |
+| Inference | Intel/neural-chat-7b-v3-3 |
+
+Benchmark parameters
+| Key               | Value |
+| ----------------- | ----- |
+| LLM input tokens  | 1024  |
+| LLM output tokens | 128   |
+
+Number of test requests for different scheduled node counts:
+| Node count | Concurrency | Query number |
+| ---------- | ----------- | ------------ |
+| 1          | 128         | 640          |
+| 2          | 256         | 1280         |
+| 4          | 512         | 2560         |
+
+More detailed configuration can be found in the configuration file [benchmark.yaml](./benchmark.yaml).
+
+### Test Steps
+
+Use `kubectl get pods` to confirm that all pods are `READY` before starting the test.
+
+#### Upload Retrieval File
+
+Before testing, upload the specified file to ensure that the LLM input has a token length of 1K.
+
+Get files:
+
+```bash
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/main/evals/benchmark/data/upload_file_no_rerank.txt
+wget https://raw.githubusercontent.com/opea-project/GenAIEval/main/evals/benchmark/data/upload_file.txt
+```
+
+Retrieve the `ClusterIP` of the `chatqna-data-prep` service.
+
+```bash
+kubectl get svc
+```
+Expected output:
+```log
+chatqna-data-prep   ClusterIP   xx.xx.xx.xx   6007/TCP   51m
+```
+
+Use the following `cURL` command to upload the files:
+
+```bash
+cd GenAIEval/evals/benchmark/data
+# RAG with Rerank
+curl -X POST "http://${cluster_ip}:6007/v1/dataprep" \
+     -H "Content-Type: multipart/form-data" \
+     -F "files=@./upload_file.txt"
+# RAG without Rerank
+curl -X POST "http://${cluster_ip}:6007/v1/dataprep" \
+     -H "Content-Type: multipart/form-data" \
+     -F "files=@./upload_file_no_rerank.txt"
+```
+
+#### Run Benchmark Test
+
+Run the benchmark test using:
+```bash
+bash benchmark.sh -n 2
+```
+The `-n` argument specifies the number of test nodes. Required dependencies will be automatically installed when running the benchmark for the first time.
+
+#### Data collection
+
+All test results will be saved in the folder `GenAIEval/evals/benchmark/benchmark_output`.
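+
+As a quick sanity check (a sketch; the exact layout under the output folder may vary with the GenAIEval version), you can locate the summary CSVs and render them as tables:
+
+```bash
+# Find every summary CSV produced by the run and pretty-print it
+find GenAIEval/evals/benchmark/benchmark_output -name "1_stats.csv" | while read -r f; do
+  echo "== $f =="; column -t -s, "$f"
+done
+```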
+
+## Teardown
+
+After completing the benchmark, use the following commands to clean up the environment:
+
+Remove Node Labels:
+```bash
+python deploy.py --delete-label
+```
+Delete the OPEA Helm Repository:
+```bash
+python deploy.py --delete-repo
+```
diff --git a/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.sh b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.sh
new file mode 100755
index 0000000000..ba69f4e963
--- /dev/null
+++ b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+deployment_type="k8s"
+node_number=1
+service_port=8888
+query_per_node=640
+
+benchmark_tool_path="$(pwd)/GenAIEval"
+
+usage() {
+    echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]"
+    echo "  -d deployment_type    ChatQnA deployment type, select between k8s and docker (default: k8s)"
+    echo "  -n node_number        Test node number, required only for k8s deployment_type (default: 1)"
+    echo "  -i service_ip         chatqna service ip, required only for docker deployment_type"
+    echo "  -p service_port       chatqna service port, required only for docker deployment_type (default: 8888)"
+    exit 1
+}
+
+while getopts ":d:n:i:p:" opt; do
+    case ${opt} in
+        d )
+            deployment_type=$OPTARG
+            ;;
+        n )
+            node_number=$OPTARG
+            ;;
+        i )
+            service_ip=$OPTARG
+            ;;
+        p )
+            service_port=$OPTARG
+            ;;
+        \? )
+            echo "Invalid option: -$OPTARG" 1>&2
+            usage
+            ;;
+        : )
+            echo "Invalid option: -$OPTARG requires an argument" 1>&2
+            usage
+            ;;
+    esac
+done
+
+if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then
+    echo "Error: service_ip is required for docker deployment_type" 1>&2
+    usage
+fi
+
+if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then
+    echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2
+fi
+
+function main() {
+    if [[ ! -d ${benchmark_tool_path} ]]; then
+        echo "Benchmark tool not found, setting up..."
+        setup_env
+    fi
+    run_benchmark
+}
+
+function setup_env() {
+    git clone https://github.com/opea-project/GenAIEval.git
+    pushd ${benchmark_tool_path}
+    python3 -m venv stress_venv
+    source stress_venv/bin/activate
+    pip install -r requirements.txt
+    popd
+}
+
+function run_benchmark() {
+    source ${benchmark_tool_path}/stress_venv/bin/activate
+    export DEPLOYMENT_TYPE=${deployment_type}
+    export SERVICE_IP=${service_ip:-"None"}
+    export SERVICE_PORT=${service_port:-"None"}
+    if [[ -z $USER_QUERIES ]]; then
+        user_query=$((query_per_node*node_number))
+        export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]"
+        echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}."
+    fi
+    export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//')
+    if [[ -z $WARMUP ]]; then export WARMUP=0; fi
+    if [[ -z $TEST_OUTPUT_DIR ]]; then
+        if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then
+            export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}"
+        else
+            export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker"
+        fi
+        echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}."
+    fi
+
+    envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml
+    cd ${benchmark_tool_path}/evals/benchmark
+    python benchmark.py
+}
+
+main
diff --git a/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml
new file mode 100644
index 0000000000..1d4ae4794e
--- /dev/null
+++ b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/benchmark.yaml
@@ -0,0 +1,69 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+test_suite_config: # Overall configuration settings for the test suite
+  examples: ["chatqna"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna
+  deployment_type: ${DEPLOYMENT_TYPE} # Default is "k8s", can also be "docker"
+  service_ip: ${SERVICE_IP} # Leave as None for k8s, specify for Docker
+  service_port: ${SERVICE_PORT} # Leave as None for k8s, specify for Docker
+  warm_ups: ${WARMUP} # Number of test requests for warm-up
+  run_time: 60m # The max total run time for the test suite
+  seed: # The seed for all RNGs
+  user_queries: ${USER_QUERIES} # Number of test requests at each concurrency level
+  query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
+  random_prompt: false # Use random prompts if true, fixed prompts if false
+  collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false
+  data_visualization: false # Generate data visualization if true, do not generate data visualization if false
+  llm_model: "Intel/neural-chat-7b-v3-3" # The LLM model used for the test
+  test_output_dir: "${TEST_OUTPUT_DIR}" # The directory to store the test output
+  load_shape: # Tenant concurrency pattern
+    name: constant # poisson or constant (locust default load shape)
+    params: # Loadshape-specific parameters
+      constant: # Constant load shape specific parameters, activate only if load_shape.name is constant
+        concurrent_level: 5 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users
+        # arrival_rate: 1.0 # Request arrival rate.
If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "my-chatqna" + +test_cases: + chatqna: + embedding: + run_test: false + service_name: "chatqna-embedding-usvc" # Replace with your service name + embedserve: + run_test: false + service_name: "chatqna-tei" # Replace with your service name + retriever: + run_test: false + service_name: "chatqna-retriever-usvc" # Replace with your service name + parameters: + search_type: "similarity" + k: 4 + fetch_k: 20 + lambda_mult: 0.5 + score_threshold: 0.2 + reranking: + run_test: false + service_name: "chatqna-reranking-usvc" # Replace with your service name + parameters: + top_n: 1 + rerankserve: + run_test: false + service_name: "chatqna-teirerank" # Replace with your service name + llm: + run_test: false + service_name: "chatqna-llm-uservice" # Replace with your service name + parameters: + max_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: false + service_name: "chatqna-tgi" # Replace with your service name + e2e: + run_test: true + service_name: "chatqna" # Replace with your service name diff --git a/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/deploy.py b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/deploy.py new file mode 100644 index 0000000000..fe8af99e74 --- /dev/null +++ b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/deploy.py @@ -0,0 +1,355 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import glob +import json +import os +import shutil +import subprocess +import sys + +import yaml +from generate_helm_values import generate_helm_values + + +def run_kubectl_command(command): + """Run a kubectl command and return the output.""" + try: + result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return result.stdout + except subprocess.CalledProcessError as e: + print(f"Error running command: {command}\n{e.stderr}") + exit(1) + + +def get_all_nodes(): + """Get the list of all nodes in the Kubernetes cluster.""" + command = ["kubectl", "get", "nodes", "-o", "json"] + output = run_kubectl_command(command) + nodes = json.loads(output) + return [node["metadata"]["name"] for node in nodes["items"]] + + +def add_label_to_node(node_name, label): + """Add a label to the specified node.""" + command = ["kubectl", "label", "node", node_name, label, "--overwrite"] + print(f"Labeling node {node_name} with {label}...") + run_kubectl_command(command) + print(f"Label {label} added to node {node_name} successfully.") + + +def add_labels_to_nodes(node_count=None, label=None, node_names=None): + """Add a label to the specified number of nodes or to specified nodes.""" + + if node_names: + # Add label to the specified nodes + for node_name in node_names: + add_label_to_node(node_name, label) + else: + # Fetch the node list and label the specified number of nodes + all_nodes = get_all_nodes() + if node_count is None or node_count > len(all_nodes): + print(f"Error: Node count exceeds the number of available nodes ({len(all_nodes)} available).") + sys.exit(1) + + selected_nodes = all_nodes[:node_count] + for node_name in selected_nodes: + add_label_to_node(node_name, label) + + +def clear_labels_from_nodes(label, node_names=None): + """Clear the specified label 
from specific nodes if provided, otherwise from all nodes.""" + label_key = label.split("=")[0] # Extract key from 'key=value' format + + # If specific nodes are provided, use them; otherwise, get all nodes + nodes_to_clear = node_names if node_names else get_all_nodes() + + for node_name in nodes_to_clear: + # Check if the node has the label by inspecting its metadata + command = ["kubectl", "get", "node", node_name, "-o", "json"] + node_info = run_kubectl_command(command) + node_metadata = json.loads(node_info) + + # Check if the label exists on this node + labels = node_metadata["metadata"].get("labels", {}) + if label_key in labels: + # Remove the label from the node + command = ["kubectl", "label", "node", node_name, f"{label_key}-"] + print(f"Removing label {label_key} from node {node_name}...") + run_kubectl_command(command) + print(f"Label {label_key} removed from node {node_name} successfully.") + else: + print(f"Label {label_key} not found on node {node_name}, skipping.") + + +def add_helm_repo(repo_name, repo_url): + # Add the repo if it does not exist + add_command = ["helm", "repo", "add", repo_name, repo_url] + try: + subprocess.run(add_command, check=True) + print(f"Added Helm repo {repo_name} from {repo_url}.") + except subprocess.CalledProcessError as e: + print(f"Failed to add Helm repo {repo_name}: {e}") + + +def delete_helm_repo(repo_name): + """Delete Helm repo if it exists.""" + command = ["helm", "repo", "remove", repo_name] + try: + subprocess.run(command, check=True) + print(f"Deleted Helm repo {repo_name}.") + except subprocess.CalledProcessError: + print(f"Failed to delete Helm repo {repo_name}. It may not exist.") + + +def configmap_exists(name, namespace): + """Check if a ConfigMap exists in the specified namespace.""" + check_command = ["kubectl", "get", "configmap", name, "-n", namespace] + result = subprocess.run(check_command, check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return result.returncode == 0 + + +def create_configmap(name, namespace, data): + """Create a ConfigMap if it does not already exist.""" + if configmap_exists(name, namespace): + print(f"ConfigMap '{name}' already exists in namespace '{namespace}', skipping creation.") + else: + create_command = ( + ["kubectl", "create", "configmap", name] + + [f"--from-literal={k}={v}" for k, v in data.items()] + + ["-n", namespace] + ) + print(f"Creating ConfigMap '{name}' in namespace '{namespace}'...") + subprocess.run(create_command, check=True) + print(f"ConfigMap '{name}' created successfully.") + + +def delete_configmap(name, namespace): + """Delete a ConfigMap if it exists.""" + if configmap_exists(name, namespace): + delete_command = ["kubectl", "delete", "configmap", name, "-n", namespace] + print(f"Deleting ConfigMap '{name}'...") + subprocess.run(delete_command, check=True) + print(f"ConfigMap '{name}' deleted successfully.") + else: + print(f"ConfigMap '{name}' does not exist in namespace '{namespace}', skipping deletion.") + + +def install_helm_release(release_name, chart_name, namespace, values_file, device_type): + """Deploy a Helm release with a specified name and chart. + + Parameters: + - release_name: The name of the Helm release. + - chart_name: The Helm chart name or path, e.g., "opea/chatqna". + - namespace: The Kubernetes namespace for deployment. + - values_file: The user values file for deployment. + - device_type: The device type (e.g., "gaudi") for specific configurations (optional). 
+ - extra_env_configmap_name: Name of the ConfigMap for extra environment variables (default "extra-env"). + """ + + # Check if the namespace exists; if not, create it + try: + # Check if the namespace exists + command = ["kubectl", "get", "namespace", namespace] + subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + except subprocess.CalledProcessError: + # Namespace does not exist, create it + print(f"Namespace '{namespace}' does not exist. Creating it...") + command = ["kubectl", "create", "namespace", namespace] + subprocess.run(command, check=True) + print(f"Namespace '{namespace}' created successfully.") + + # This is workaround for teirerank-gaudi, will be removed later + create_configmap("extra-env", namespace, {"MAX_WARMUP_SEQUENCE_LENGTH": "512"}) + + # Handle gaudi-specific values file if device_type is "gaudi" + hw_values_file = None + untar_dir = None + if device_type == "gaudi": + print("Device type is gaudi. Pulling Helm chart to get gaudi-values.yaml...") + + # Pull and untar the chart + subprocess.run(["helm", "pull", chart_name, "--untar"], check=True) + + # Determine the directory name (get the actual chart_name if chart_name is in the format 'repo_name/chart_name', else use chart_name directly) + chart_dir_name = chart_name.split("/")[-1] if "/" in chart_name else chart_name + + # Find the untarred directory (assumes only one directory matches chart_dir_name) + untar_dirs = glob.glob(f"{chart_dir_name}*") + if untar_dirs: + untar_dir = untar_dirs[0] + hw_values_file = os.path.join(untar_dir, "gaudi-values.yaml") + print("gaudi-values.yaml pulled and ready for use.") + else: + print(f"Error: Could not find untarred directory for {chart_name}") + return + + # Prepare the Helm install command + command = ["helm", "install", release_name, chart_name, "--namespace", namespace] + + # Append additional values file for gaudi if it exists + if hw_values_file: + command.extend(["-f", hw_values_file]) + + # Append the main values file + command.extend(["-f", values_file]) + + # Execute the Helm install command + try: + print(f"Running command: {' '.join(command)}") # Print full command for debugging + subprocess.run(command, check=True) + print("Deployment initiated successfully.") + except subprocess.CalledProcessError as e: + print(f"Error occurred while deploying Helm release: {e}") + + # Cleanup: Remove the untarred directory + if untar_dir and os.path.isdir(untar_dir): + print(f"Removing temporary directory: {untar_dir}") + shutil.rmtree(untar_dir) + print("Temporary directory removed successfully.") + + +def uninstall_helm_release(release_name, namespace=None): + """Uninstall a Helm release and clean up resources, optionally delete the namespace if not 'default'.""" + # Default to 'default' namespace if none is specified + if not namespace: + namespace = "default" + + try: + # This is workaround for teirerank-gaudi, will be removed later + delete_configmap("extra-env", namespace) + + # Uninstall the Helm release + command = ["helm", "uninstall", release_name, "--namespace", namespace] + print(f"Uninstalling Helm release {release_name} in namespace {namespace}...") + run_kubectl_command(command) + print(f"Helm release {release_name} uninstalled successfully.") + + # If the namespace is specified and not 'default', delete it + if namespace != "default": + print(f"Deleting namespace {namespace}...") + delete_namespace_command = ["kubectl", "delete", "namespace", namespace] + run_kubectl_command(delete_namespace_command) + print(f"Namespace 
{namespace} deleted successfully.") + else: + print("Namespace is 'default', skipping deletion.") + + except subprocess.CalledProcessError as e: + print(f"Error occurred while uninstalling Helm release or deleting namespace: {e}") + + +def main(): + parser = argparse.ArgumentParser(description="Manage Helm Deployment.") + parser.add_argument( + "--release-name", + type=str, + default="chatqna", + help="The Helm release name created during deployment (default: chatqna).", + ) + parser.add_argument( + "--chart-name", + type=str, + default="opea/chatqna", + help="The chart name to deploy, composed of repo name and chart name (default: opea/chatqna).", + ) + parser.add_argument("--namespace", default="default", help="Kubernetes namespace (default: default).") + parser.add_argument("--hf-token", help="Hugging Face API token.") + parser.add_argument( + "--model-dir", help="Model directory, mounted as volumes for service access to pre-downloaded models" + ) + parser.add_argument("--repo-name", default="opea", help="Helm repo name to add/delete (default: opea).") + parser.add_argument( + "--repo-url", + default="https://opea-project.github.io/GenAIInfra", + help="Helm repository URL (default: https://opea-project.github.io/GenAIInfra).", + ) + parser.add_argument("--user-values", help="Path to a user-specified values.yaml file.") + parser.add_argument( + "--create-values-only", action="store_true", help="Only create the values.yaml file without deploying." + ) + parser.add_argument("--uninstall", action="store_true", help="Uninstall the Helm release.") + parser.add_argument("--num-nodes", type=int, default=1, help="Number of nodes to use (default: 1).") + parser.add_argument("--node-names", nargs="*", help="Optional specific node names to label.") + parser.add_argument("--add-label", action="store_true", help="Add label to specified nodes if this flag is set.") + parser.add_argument( + "--delete-label", action="store_true", help="Delete label from specified nodes if this flag is set." + ) + parser.add_argument( + "--label", default="node-type=opea-benchmark", help="Label to add/delete (default: node-type=opea-benchmark)." 
+    )
+    parser.add_argument("--with-rerank", action="store_true", help="Include rerank service in the deployment.")
+    parser.add_argument(
+        "--tuned",
+        action="store_true",
+        help="Modify resources for services and change extraCmdArgs when creating values.yaml.",
+    )
+    parser.add_argument("--add-repo", action="store_true", help="Add the Helm repo specified by --repo-url.")
+    parser.add_argument("--delete-repo", action="store_true", help="Delete the Helm repo specified by --repo-name.")
+    parser.add_argument(
+        "--device-type",
+        type=str,
+        choices=["cpu", "gaudi"],
+        default="gaudi",
+        help="Specify the device type for deployment (choices: 'cpu', 'gaudi'; default: gaudi).",
+    )
+
+    args = parser.parse_args()
+
+    # Adjust num-nodes based on node-names if specified
+    if args.node_names:
+        num_node_names = len(args.node_names)
+        if args.num_nodes != 1 and args.num_nodes != num_node_names:
+            parser.error("--num-nodes must match the number of --node-names if both are specified.")
+        else:
+            args.num_nodes = num_node_names
+
+    # Helm repository management
+    if args.add_repo:
+        add_helm_repo(args.repo_name, args.repo_url)
+        return
+    elif args.delete_repo:
+        delete_helm_repo(args.repo_name)
+        return
+
+    # Node labeling management
+    if args.add_label:
+        add_labels_to_nodes(args.num_nodes, args.label, args.node_names)
+        return
+    elif args.delete_label:
+        clear_labels_from_nodes(args.label, args.node_names)
+        return
+
+    # Uninstall Helm release if specified
+    if args.uninstall:
+        uninstall_helm_release(args.release_name, args.namespace)
+        return
+
+    # Prepare values.yaml if not uninstalling
+    if args.user_values:
+        values_file_path = args.user_values
+    else:
+        if not args.hf_token:
+            parser.error("--hf-token is required")
+        node_selector = {args.label.split("=")[0]: args.label.split("=")[1]}
+        values_file_path = generate_helm_values(
+            with_rerank=args.with_rerank,
+            num_nodes=args.num_nodes,
+            hf_token=args.hf_token,
+            model_dir=args.model_dir,
+            node_selector=node_selector,
+            tune=args.tuned,
+        )
+
+    # Read back the generated YAML file for verification
+    with open(values_file_path, "r") as file:
+        print("Generated YAML contents:")
+        print(file.read())
+
+    # Deploy unless --create-values-only is specified
+    if not args.create_values_only:
+        install_helm_release(args.release_name, args.chart_name, args.namespace, values_file_path, args.device_type)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/generate_helm_values.py b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/generate_helm_values.py
new file mode 100644
index 0000000000..b288818009
--- /dev/null
+++ b/ChatQnA/benchmark/performance/kubernetes/intel/gaudi/generate_helm_values.py
@@ -0,0 +1,167 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import yaml
+
+
+def generate_helm_values(with_rerank, num_nodes, hf_token, model_dir, node_selector=None, tune=False):
+    """Create a values.yaml file based on the provided configuration."""
+
+    # Log the received parameters
+    print("Received parameters:")
+    print(f"with_rerank: {with_rerank}")
+    print(f"num_nodes: {num_nodes}")
+    print(f"node_selector: {node_selector}")  # Log the node_selector
+    print(f"tune: {tune}")
+
+    if node_selector is None:
+        node_selector = {}
+
+    # Construct the base values dictionary
+    values = {
+        "tei": {"nodeSelector": {key: value for key, value in node_selector.items()}},
+        "tgi": {"nodeSelector": {key: value for key, value in node_selector.items()}},
+
"data-prep": {"nodeSelector": {key: value for key, value in node_selector.items()}}, + "redis-vector-db": {"nodeSelector": {key: value for key, value in node_selector.items()}}, + "retriever-usvc": {"nodeSelector": {key: value for key, value in node_selector.items()}}, + "llm-uservice": {"nodeSelector": {key: value for key, value in node_selector.items()}}, + "embedding-usvc": {"nodeSelector": {key: value for key, value in node_selector.items()}}, + "chatqna-ui": {"nodeSelector": {key: value for key, value in node_selector.items()}}, + "global": { + "HUGGINGFACEHUB_API_TOKEN": hf_token, # Use passed token + "modelUseHostPath": model_dir, # Use passed model directory + "extraEnvConfig": "extra-env", # Added MAX_WARMUP_SEQUENCE_LENGTH: 512 to extra-env in deploy.py + }, + "nodeSelector": {key: value for key, value in node_selector.items()}, + } + + if with_rerank: + values["teirerank"] = {"nodeSelector": {key: value for key, value in node_selector.items()}} + values["reranking-usvc"] = {"nodeSelector": {key: value for key, value in node_selector.items()}} + else: + values["image"] = {"repository": "opea/chatqna-without-rerank"} + + default_replicas = [ + {"name": "chatqna", "replicaCount": 2}, + {"name": "tei", "replicaCount": 1}, + {"name": "teirerank", "replicaCount": 1} if with_rerank else None, + {"name": "tgi", "replicaCount": 7 if with_rerank else 8}, + {"name": "data-prep", "replicaCount": 1}, + {"name": "redis-vector-db", "replicaCount": 1}, + {"name": "retriever-usvc", "replicaCount": 2}, + ] + + if num_nodes > 1: + # Scale replicas based on number of nodes + replicas = [ + {"name": "chatqna", "replicaCount": 1 * num_nodes}, + {"name": "tei", "replicaCount": 1 * num_nodes}, + {"name": "teirerank", "replicaCount": 1} if with_rerank else None, + {"name": "tgi", "replicaCount": (8 * num_nodes - 1) if with_rerank else 8 * num_nodes}, + {"name": "data-prep", "replicaCount": 1}, + {"name": "redis-vector-db", "replicaCount": 1}, + {"name": "retriever-usvc", "replicaCount": 1 * num_nodes}, + ] + else: + replicas = default_replicas + + # Remove None values for rerank disabled + replicas = [r for r in replicas if r] + + # Update values.yaml with replicas + for replica in replicas: + service_name = replica["name"] + if service_name == "chatqna": + values["replicaCount"] = replica["replicaCount"] + print(replica["replicaCount"]) + elif service_name in values: + values[service_name]["replicaCount"] = replica["replicaCount"] + + # Prepare resource configurations based on tuning + resources = [] + if tune: + resources = [ + { + "name": "chatqna", + "resources": { + "limits": {"cpu": "16", "memory": "8000Mi"}, + "requests": {"cpu": "16", "memory": "8000Mi"}, + }, + }, + { + "name": "tei", + "resources": { + "limits": {"cpu": "80", "memory": "20000Mi"}, + "requests": {"cpu": "80", "memory": "20000Mi"}, + }, + }, + {"name": "teirerank", "resources": {"limits": {"habana.ai/gaudi": 1}}} if with_rerank else None, + {"name": "tgi", "resources": {"limits": {"habana.ai/gaudi": 1}}}, + {"name": "retriever-usvc", "resources": {"requests": {"cpu": "8", "memory": "8000Mi"}}}, + ] + + # Filter out any None values directly as part of initialization + resources = [r for r in resources if r is not None] + + # Add resources for each service if tuning + for resource in resources: + service_name = resource["name"] + if service_name == "chatqna": + values["resources"] = resource["resources"] + elif service_name in values: + values[service_name]["resources"] = resource["resources"] + + # Add extraCmdArgs for tgi 
service with default values + if "tgi" in values: + values["tgi"]["extraCmdArgs"] = [ + "--max-input-length", + "1280", + "--max-total-tokens", + "2048", + "--max-batch-total-tokens", + "65536", + "--max-batch-prefill-tokens", + "4096", + ] + + yaml_string = yaml.dump(values, default_flow_style=False) + + # Determine the mode based on the 'tune' parameter + mode = "tuned" if tune else "oob" + + # Determine the filename based on 'with_rerank' and 'num_nodes' + if with_rerank: + filename = f"{mode}_{num_nodes}_gaudi_with_rerank.yaml" + else: + filename = f"{mode}_{num_nodes}_gaudi_without_rerank.yaml" + + # Write the YAML data to the file + with open(filename, "w") as file: + file.write(yaml_string) + + # Get the current working directory and construct the file path + current_dir = os.getcwd() + filepath = os.path.join(current_dir, filename) + + print(f"YAML file {filepath} has been generated.") + return filepath # Optionally return the file path + + +# Main execution for standalone use of create_values_yaml +if __name__ == "__main__": + # Example values for standalone execution + with_rerank = True + num_nodes = 2 + hftoken = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + modeldir = "/mnt/model" + node_selector = {"node-type": "opea-benchmark"} + tune = True + + filename = generate_helm_values(with_rerank, num_nodes, hftoken, modeldir, node_selector, tune) + + # Read back the generated YAML file for verification + with open(filename, "r") as file: + print("Generated YAML contents:") + print(file.read()) diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py index d168d1055f..95318e9613 100644 --- a/ChatQnA/chatqna.py +++ b/ChatQnA/chatqna.py @@ -148,6 +148,8 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di next_data["inputs"] = prompt + elif self.services[cur_node].service_type == ServiceType.LLM and not llm_parameters_dict["streaming"]: + next_data["text"] = data["choices"][0]["message"]["content"] else: next_data = data @@ -166,7 +168,10 @@ def align_generator(self, gen, **kwargs): try: # sometimes yield empty chunk, do a fallback here json_data = json.loads(json_str) - if json_data["choices"][0]["finish_reason"] != "eos_token": + if ( + json_data["choices"][0]["finish_reason"] != "eos_token" + and "content" in json_data["choices"][0]["delta"] + ): yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n" except Exception as e: yield f"data: {repr(json_str.encode('utf-8'))}\n\n" diff --git a/ChatQnA/chatqna.yaml b/ChatQnA/chatqna.yaml index e8a2d27357..0344b28317 100644 --- a/ChatQnA/chatqna.yaml +++ b/ChatQnA/chatqna.yaml @@ -19,7 +19,7 @@ opea_micro_services: tei-embedding-service: host: ${TEI_EMBEDDING_SERVICE_IP} ports: ${TEI_EMBEDDING_SERVICE_PORT} - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 volumes: - "./data:/data" runtime: habana @@ -38,7 +38,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 volumes: - "./data:/data" runtime: habana diff --git a/ChatQnA/chatqna_wrapper.py b/ChatQnA/chatqna_wrapper.py new file mode 100644 index 0000000000..09062b5d27 --- /dev/null +++ b/ChatQnA/chatqna_wrapper.py @@ -0,0 +1,68 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType + +MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") 
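+# Note: the defaults below assume every dependent microservice runs locally; in a
+# compose or Kubernetes deployment, each *_SERVICE_HOST_IP / *_SERVICE_PORT is
+# expected to be overridden through the environment.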
+MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) +EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") +EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) +RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") +RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) +RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") +RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) +LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") +LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) + + +class ChatQnAService: + def __init__(self, host="0.0.0.0", port=8000): + self.host = host + self.port = port + self.megaservice = ServiceOrchestrator() + + def add_remote_service(self): + embedding = MicroService( + name="embedding", + host=EMBEDDING_SERVICE_HOST_IP, + port=EMBEDDING_SERVICE_PORT, + endpoint="/v1/embeddings", + use_remote_service=True, + service_type=ServiceType.EMBEDDING, + ) + retriever = MicroService( + name="retriever", + host=RETRIEVER_SERVICE_HOST_IP, + port=RETRIEVER_SERVICE_PORT, + endpoint="/v1/retrieval", + use_remote_service=True, + service_type=ServiceType.RETRIEVER, + ) + rerank = MicroService( + name="rerank", + host=RERANK_SERVICE_HOST_IP, + port=RERANK_SERVICE_PORT, + endpoint="/v1/reranking", + use_remote_service=True, + service_type=ServiceType.RERANK, + ) + llm = MicroService( + name="llm", + host=LLM_SERVICE_HOST_IP, + port=LLM_SERVICE_PORT, + endpoint="/v1/chat/completions", + use_remote_service=True, + service_type=ServiceType.LLM, + ) + self.megaservice.add(embedding).add(retriever).add(rerank).add(llm) + self.megaservice.flow_to(embedding, retriever) + self.megaservice.flow_to(retriever, rerank) + self.megaservice.flow_to(rerank, llm) + self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + + +if __name__ == "__main__": + chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) + chatqna.add_remote_service() diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md index 4598c07ec0..8396df454f 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md @@ -26,7 +26,6 @@ To set up environment variables for deploying ChatQnA services, follow these ste export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" export no_proxy="Your_No_Proxy",chatqna-xeon-ui-server,chatqna-xeon-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service ``` @@ -48,13 +47,13 @@ docker pull opea/chatqna:latest docker pull opea/chatqna-ui:latest ``` -In following cases, you could build docker image from source by yourself. - -- Failed to download the docker image. +NB: You should build docker image from source by yourself if: -- If you want to use a specific version of Docker image. +- You are developing off the git main branch (as the container's ports in the repo may be different from the published docker image). +- You can't download the docker image. +- You want to use a specific version of Docker image. -Please refer to 'Build Docker Images' in below. +Please refer to ['Build Docker Images'](#🚀-build-docker-images) in below. 
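+
+As a quick check (a sketch, assuming the default `latest` tag), you can confirm the images exist locally before starting the services:
+
+```bash
+# List the locally available ChatQnA images
+docker images --filter reference="opea/chatqna*"
+```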
## QuickStart: 3. Consume the ChatQnA Service
@@ -195,7 +194,7 @@ For users in China who are unable to download models directly from Huggingface,
   export HF_TOKEN=${your_hf_token}
   export HF_ENDPOINT="https://hf-mirror.com"
   model_name="Intel/neural-chat-7b-v3-3"
-   docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu --model-id $model_name
+   docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id $model_name
   ```

2. Offline
@@ -209,7 +208,7 @@ For users in China who are unable to download models directly from Huggingface,
   ```bash
   export HF_TOKEN=${your_hf_token}
   export model_path="/path/to/model"
-   docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu --model-id /data
+   docker run -p 8008:80 -v $model_path:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu --model-id /data
   ```

### Setup Environment Variables
@@ -324,17 +323,17 @@ For details on how to verify the correctness of the response, refer to [how-to-v
   ```bash
   # TGI service
-   curl http://${host_ip}:9009/generate \
+   curl http://${host_ip}:9009/v1/chat/completions \
     -X POST \
-     -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+     -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
     -H 'Content-Type: application/json'
   ```

   ```bash
   # vLLM Service
-   curl http://${host_ip}:9009/v1/completions \
+   curl http://${host_ip}:9009/v1/chat/completions \
     -H "Content-Type: application/json" \
-     -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}'
+     -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
   ```

5. MegaService
@@ -433,6 +432,66 @@ curl -X POST "http://${host_ip}:6007/v1/dataprep/delete_file" \
   -H "Content-Type: application/json"
```

+### Profile Microservices
+
+To further analyze microservice performance, users can follow the instructions below to profile the microservices.
+
+#### 1. vLLM backend Service
+
+Users can follow the previous section to test the vLLM microservice or the ChatQnA MegaService.
+ By default, vLLM profiling is not enabled. Users can start and stop profiling with the following commands.
+
+##### Start vLLM profiling
+
+```bash
+curl http://${host_ip}:9009/start_profile \
+  -H "Content-Type: application/json" \
+  -d '{"model": "Intel/neural-chat-7b-v3-3"}'
+```
+
+Users should see the docker logs below from vllm-service if profiling started correctly.
+
+```bash
+INFO api_server.py:361] Starting profiler...
+INFO api_server.py:363] Profiler started.
+INFO:     x.x.x.x:35940 - "POST /start_profile HTTP/1.1" 200 OK
+```
+
+After vLLM profiling is started, users can start asking questions and getting responses from the vLLM microservice
+ or the ChatQnA MegaService.
+
+##### Stop vLLM profiling
+
+With the following command, users can stop vLLM profiling and generate a \*.pt.trace.json.gz file as the profiling result
+ under the /mnt folder in the vllm-service docker instance.
+
+```bash
+# vLLM Service
+curl http://${host_ip}:9009/stop_profile \
+  -H "Content-Type: application/json" \
+  -d '{"model": "Intel/neural-chat-7b-v3-3"}'
+```
+
+Users should see the docker logs below from vllm-service if profiling stopped correctly.
+
+```bash
+INFO api_server.py:368] Stopping profiler...
+INFO api_server.py:370] Profiler stopped.
+INFO:     x.x.x.x:41614 - "POST /stop_profile HTTP/1.1" 200 OK
+```
+
+After vLLM profiling is stopped, users can use the command below to copy the \*.pt.trace.json.gz file out of the /mnt folder.
+
+```bash
+docker cp vllm-service:/mnt/ .
+```
+
+##### Check profiling result
+
+Open a web browser, go to "chrome://tracing" or "ui.perfetto.dev", and then load the json.gz file; you should be able
+ to see the vLLM profiling result as in the diagram below.
+![image](https://github.com/user-attachments/assets/55c7097e-5574-41dc-97a7-5e87c31bc286)
+
## 🚀 Launch the UI

### Launch with origin port
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md
new file mode 100644
index 0000000000..f730a91aea
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_pinecone.md
@@ -0,0 +1,382 @@
+# Build Mega Service of ChatQnA (with Pinecone) on Xeon
+
+This document outlines the deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Xeon server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank`, and `llm`. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
+
+## 🚀 Apply Xeon Server on AWS
+
+To apply a Xeon server on AWS, start by creating an AWS account if you don't have one already. Then, head to the [EC2 Console](https://console.aws.amazon.com/ec2/v2/home) to begin the process. Within the EC2 service, select the Amazon EC2 M7i or M7i-flex instance type to leverage the power of 4th Generation Intel Xeon Scalable processors. These instances are optimized for high-performance computing and demanding workloads.
+
+For detailed information about these instance types, you can refer to this [link](https://aws.amazon.com/ec2/instance-types/m7i/). Once you've chosen the appropriate instance type, proceed with configuring your instance settings, including network configurations, security groups, and storage options.
+
+After launching your instance, you can connect to it using SSH (for Linux instances) or Remote Desktop Protocol (RDP) (for Windows instances). From there, you'll have full access to your Xeon server, allowing you to install, configure, and manage your applications as needed.
+
+**Certain ports in the EC2 instance need to be opened up in the security group for the microservices to work with the curl commands**
+
+> See one example below. Please open up these ports in the EC2 instance based on the IP addresses you want to allow
+
+```
+
+data_prep_service
+=====================
+Port 6007 - Open to 0.0.0.0/0
+Port 6008 - Open to 0.0.0.0/0
+
+tei_embedding_service
+=====================
+Port 6006 - Open to 0.0.0.0/0
+
+embedding
+=========
+Port 6000 - Open to 0.0.0.0/0
+
+retriever
+=========
+Port 7000 - Open to 0.0.0.0/0
+
+tei_xeon_service
+================
+Port 8808 - Open to 0.0.0.0/0
+
+reranking
+=========
+Port 8000 - Open to 0.0.0.0/0
+
+tgi-service
+===========
+Port 9009 - Open to 0.0.0.0/0
+
+llm
+===
+Port 9000 - Open to 0.0.0.0/0
+
+chaqna-xeon-backend-server
+==========================
+Port 8888 - Open to 0.0.0.0/0
+
+chaqna-xeon-ui-server
+=====================
+Port 5173 - Open to 0.0.0.0/0
+```
+
+## 🚀 Build Docker Images
+
+First of all, you need to build the Docker images locally.
+
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+```
+
+### 1. Build Embedding Image
+
+```bash
+docker build --no-cache -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/tei/langchain/Dockerfile .
+```
+
+### 2. Build Retriever Image
+
+```bash
+docker build --no-cache -t opea/retriever-pinecone:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/pinecone/langchain/Dockerfile .
+```
+
+### 3. Build Rerank Image
+
+```bash
+docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/tei/Dockerfile .
+```
+
+### 4. Build LLM Image
+
+```bash
+docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile .
+```
+
+### 5. Build Dataprep Image
+
+```bash
+docker build --no-cache -t opea/dataprep-pinecone:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/pinecone/langchain/Dockerfile .
+cd ..
+```
+
+### 6. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build the MegaService Docker image via the command below:
+
+```bash
+git clone https://github.com/opea-project/GenAIExamples.git
+cd GenAIExamples/ChatQnA/docker
+docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
+cd ../../..
+```
+
+### 7. Build UI Docker Image
+
+Build the frontend Docker image via the command below:
+
+```bash
+cd GenAIExamples/ChatQnA/docker/ui/
+docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
+cd ../../../..
+```
+
+### 8. Build Conversational React UI Docker Image (Optional)
+
+Build the frontend Docker image that enables a conversational experience with the ChatQnA megaservice via the command below:
+
+**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
+
+```bash
+cd GenAIExamples/ChatQnA/docker/ui/
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/chatqna"
+export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get_file"
+docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg DATAPREP_SERVICE_ENDPOINT=$DATAPREP_SERVICE_ENDPOINT --build-arg DATAPREP_GET_FILE_ENDPOINT=$DATAPREP_GET_FILE_ENDPOINT -f ./docker/Dockerfile.react .
+cd ../../../..
+```
+
+Then run the command `docker images`; you should see the following 7 Docker images:
+
+1. `opea/dataprep-pinecone:latest`
+2. `opea/embedding-tei:latest`
+3. `opea/retriever-pinecone:latest`
+4. `opea/reranking-tei:latest`
+5. `opea/llm-tgi:latest`
+6. `opea/chatqna:latest`
+7. `opea/chatqna-ui:latest`
+
+## 🚀 Start Microservices
+
+### Setup Environment Variables
+
+Since the `compose_pinecone.yaml` will consume some environment variables, you need to set them up in advance as below.
+
+**Export the value of the public IP address of your Xeon server to the `host_ip` environment variable**
+
+> Replace External_Public_IP below with the actual IPv4 value
+
+```
+export host_ip="External_Public_IP"
+```
+
+**Export the value of your Huggingface API token to the `your_hf_api_token` environment variable**
+
+> Replace Your_Huggingface_API_Token below with your actual Huggingface API Token value
+
+```
+export your_hf_api_token="Your_Huggingface_API_Token"
+```
+
+**Append the value of the public IP address to the no_proxy list**
+
+```
+export your_no_proxy=${your_no_proxy},"External_Public_IP"
+```
+
+**Get the PINECONE_API_KEY and the INDEX_NAME**
+
+```
+export pinecone_api_key=${api_key}
+export pinecone_index_name=${pinecone_index}
+```
+
+```bash
+export no_proxy=${your_no_proxy}
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_http_proxy}
+export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
+export RERANK_MODEL_ID="BAAI/bge-reranker-base"
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
+export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808"
+export TGI_LLM_ENDPOINT="http://${host_ip}:9009"
+export PINECONE_API_KEY=${pinecone_api_key}
+export PINECONE_INDEX_NAME=${pinecone_index_name}
+export INDEX_NAME=${pinecone_index_name}
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export EMBEDDING_SERVICE_HOST_IP=${host_ip}
+export RETRIEVER_SERVICE_HOST_IP=${host_ip}
+export RERANK_SERVICE_HOST_IP=${host_ip}
+export LLM_SERVICE_HOST_IP=${host_ip}
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/chatqna"
+export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep"
+export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6008/v1/dataprep/get_file"
+export DATAPREP_DELETE_FILE_ENDPOINT="http://${host_ip}:6009/v1/dataprep/delete_file"
+```
+
+Note: Please replace `host_ip` with your external IP address; do not use localhost.
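+
+Before starting the containers, a small guard like the following (a sketch; extend the variable list to whatever your setup requires) can catch unset variables early:
+
+```bash
+# Fail fast if any required variable is missing
+for v in host_ip HUGGINGFACEHUB_API_TOKEN PINECONE_API_KEY PINECONE_INDEX_NAME; do
+  [[ -z "${!v}" ]] && { echo "ERROR: $v is not set"; exit 1; }
+done
+echo "All required variables are set."
+```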
+
+### Start all the services Docker Containers
+
+> Before running the docker compose command, you need to be in the folder that has the docker compose yaml file
+
+```bash
+cd GenAIExamples/ChatQnA/docker/xeon/
+docker compose -f compose_pinecone.yaml up -d
+```
+
+### Validate Microservices
+
+1. TEI Embedding Service
+
+```bash
+curl ${host_ip}:6006/embed \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?"}' \
+    -H 'Content-Type: application/json'
+```
+
+2. Embedding Microservice
+
+```bash
+curl http://${host_ip}:6000/v1/embeddings\
+  -X POST \
+  -d '{"text":"hello"}' \
+  -H 'Content-Type: application/json'
+```
+
+3. Retriever Microservice
+   To validate the retriever microservice, you need to generate a mock embedding vector of length 768 with a Python script:
+
+```Python
+import random
+embedding = [random.uniform(-1, 1) for _ in range(768)]
+print(embedding)
+```
+
+Then substitute your mock embedding vector for the `${your_embedding}` in the following cURL command:
+
+```bash
+curl http://${host_ip}:7000/v1/retrieval \
+  -X POST \
+  -d '{"text":"What is the revenue of Nike in 2023?","embedding":"'"${your_embedding}"'"}' \
+  -H 'Content-Type: application/json'
+```
+
+4. TEI Reranking Service
+
+```bash
+curl http://${host_ip}:8808/rerank \
+    -X POST \
+    -d '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' \
+    -H 'Content-Type: application/json'
+```
+
+5. Reranking Microservice
+
+```bash
+curl http://${host_ip}:8000/v1/reranking\
+  -X POST \
+  -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \
+  -H 'Content-Type: application/json'
+```
+
+6. TGI Service
+
+```bash
+curl http://${host_ip}:9009/generate \
+  -X POST \
+  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+  -H 'Content-Type: application/json'
+```
+
+7. LLM Microservice
+
+```bash
+curl http://${host_ip}:9000/v1/chat/completions\
+  -X POST \
+  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -H 'Content-Type: application/json'
+```
+
+8. MegaService
+
+```bash
+curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
+     "messages": "What is the revenue of Nike in 2023?"
+     }'
+```
+
+9. Dataprep Microservice (Optional)
+
+If you want to update the default knowledge base, you can use the following commands:
+
+Update Knowledge Base via Local File Upload:
+
+```bash
+curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+     -H "Content-Type: multipart/form-data" \
+     -F "files=@./nke-10k-2023.pdf"
+```
+
+This command updates a knowledge base by uploading a local file for processing. Update the file path according to your environment.
+
+Add Knowledge Base via HTTP Links:
+
+```bash
+curl -X POST "http://${host_ip}:6007/v1/dataprep" \
+     -H "Content-Type: multipart/form-data" \
+     -F 'link_list=["https://opea.dev"]'
+```
+
+This command updates a knowledge base by submitting a list of HTTP links for processing.
+
+Also, you are able to get the file list that you uploaded:
+
+```bash
+curl -X POST "http://${host_ip}:6008/v1/dataprep/get_file" \
+     -H "Content-Type: application/json"
+```
+
+## Enable LangSmith to Monitor the Application (Optional)
+
+LangSmith offers tools to debug, evaluate, and monitor language models and intelligent agents. It can be used to assess benchmark data for each microservice.
Before launching your services with `docker compose -f compose_pinecone.yaml up -d`, you need to enable LangSmith tracing by setting the `LANGCHAIN_TRACING_V2` environment variable to true and configuring your LangChain API key. + +Here's how you can do it: + +1. Install the latest version of LangSmith: + +```bash +pip install -U langsmith +``` + +2. Set the necessary environment variables: + +```bash +export LANGCHAIN_TRACING_V2=true +export LANGCHAIN_API_KEY=ls_... +``` + +## 🚀 Launch the UI + +To access the frontend, open the following URL in your browser: http://{host_ip}:5173. By default, the UI runs on port 5173 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose_pinecone.yaml` file as shown below: + +```yaml + chatqna-xeon-ui-server: + image: opea/chatqna-ui:latest + ... + ports: + - "80:5173" +``` + +## 🚀 Launch the Conversational UI (React) + +To access the Conversational UI frontend, open the following URL in your browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. If you prefer to use a different host port to access the frontend, you can modify the port mapping in the `compose_pinecone.yaml` file as shown below: + +```yaml + chatqna-xeon-conversation-ui-server: + image: opea/chatqna-conversation-ui:latest + ... + ports: + - "80:80" +``` + +![project-screenshot](../../../../assets/img/chat_ui_init.png) + +Here is an example of running ChatQnA: + +![project-screenshot](../../../../assets/img/chat_ui_response.png) + +Here is an example of running ChatQnA with Conversational UI (React): + +![project-screenshot](../../../../assets/img/conversation_ui_response.png) diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md index 1adfe8cf17..2f9fa1b822 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md @@ -252,9 +252,9 @@ For details on how to verify the correctness of the response, refer to [how-to-v Then try the `cURL` command below to validate TGI.
```bash - curl http://${host_ip}:6042/generate \ + curl http://${host_ip}:6042/v1/chat/completions \ -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml index 14794e8d4f..0c290b8683 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -73,7 +73,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "9009:80" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml similarity index 52% rename from ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml rename to ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml index a1019c9ac1..f42fd6fd2d 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_pinecone.yaml @@ -1,142 +1,121 @@ + # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +version: "3.8" + services: - redis-vector-db: - image: redis/redis-stack:7.2.0-v9 - container_name: redis-vector-db - ports: - - "6379:6379" - - "8001:8001" - dataprep-redis-service: - image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} - container_name: dataprep-redis-server + dataprep-pinecone-service: + image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} + container_name: dataprep-pinecone-server depends_on: - - redis-vector-db - tei-embedding-service ports: - "6007:6007" + - "6008:6008" + - "6009:6009" environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - REDIS_URL: redis://redis-vector-db:6379 - REDIS_HOST: redis-vector-db - INDEX_NAME: ${INDEX_NAME} - TEI_ENDPOINT: http://tei-embedding-service:80 + PINECONE_API_KEY: ${PINECONE_API_KEY} + PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME} + TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 + LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest - container_name: tei-embedding-gaudi-server + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server ports: - - "8090:80" + - "6006:80" volumes: - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate retriever: - image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} - container_name: retriever-redis-server - depends_on: - - redis-vector-db + image: ${REGISTRY:-opea}/retriever-pinecone:${TAG:-latest} + container_name: retriever-pinecone-server ports: - "7000:7000" ipc: host environment: - no_proxy: ${no_proxy} http_proxy: ${http_proxy} 
https_proxy: ${https_proxy} - REDIS_URL: redis://redis-vector-db:6379 - REDIS_HOST: redis-vector-db - INDEX_NAME: ${INDEX_NAME} + PINECONE_API_KEY: ${PINECONE_API_KEY} + INDEX_NAME: ${PINECONE_INDEX_NAME} + PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME} + LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/tei-gaudi:latest - container_name: tei-reranking-gaudi-server + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-reranking-server ports: - "8808:80" volumes: - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - vllm-ray-service: - image: ${REGISTRY:-opea}/llm-vllm-ray-hpu:${TAG:-latest} - container_name: vllm-ray-gaudi-server + tgi-service: + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + container_name: tgi-service ports: - - "8006:8000" + - "9009:80" volumes: - "./data:/data" + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - LLM_MODEL_ID: ${LLM_MODEL_ID} - runtime: habana - cap_add: - - SYS_NICE - ipc: host - command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True" - chatqna-gaudi-backend-server: + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 + chatqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} - container_name: chatqna-gaudi-backend-server + container_name: chatqna-xeon-backend-server depends_on: - - redis-vector-db - tei-embedding-service + - dataprep-pinecone-service - retriever - tei-reranking-service - - vllm-ray-service + - tgi-service ports: - "8888:8888" environment: - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - - MEGA_SERVICE_HOST_IP=chatqna-gaudi-backend-server + - MEGA_SERVICE_HOST_IP=chatqna-xeon-backend-server - EMBEDDING_SERVER_HOST_IP=tei-embedding-service - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80} - RETRIEVER_SERVICE_HOST_IP=retriever - RERANK_SERVER_HOST_IP=tei-reranking-service - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=vllm-ray-service - - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8000} - - LLM_MODEL=${LLM_MODEL_ID} + - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} - LOGFLAG=${LOGFLAG} + - LLM_MODEL=${LLM_MODEL_ID} ipc: host restart: always - chatqna-gaudi-ui-server: + chatqna-xeon-ui-server: image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest} - container_name: chatqna-gaudi-ui-server + container_name: chatqna-xeon-ui-server depends_on: - - chatqna-gaudi-backend-server + - chatqna-xeon-backend-server ports: - "5173:5173" environment: @@ -145,24 +124,24 @@ services: - http_proxy=${http_proxy} ipc: host restart: always - 
chatqna-gaudi-nginx-server: + chatqna-xeon-nginx-server: image: ${REGISTRY:-opea}/nginx:${TAG:-latest} - container_name: chatqna-gaudi-nginx-server + container_name: chatqna-xeon-nginx-server depends_on: - - chatqna-gaudi-backend-server - - chatqna-gaudi-ui-server + - chatqna-xeon-backend-server + - chatqna-xeon-ui-server ports: - "${NGINX_PORT:-80}:80" environment: - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - - FRONTEND_SERVICE_IP=chatqna-gaudi-ui-server + - FRONTEND_SERVICE_IP=chatqna-xeon-ui-server - FRONTEND_SERVICE_PORT=5173 - BACKEND_SERVICE_NAME=chatqna - - BACKEND_SERVICE_IP=chatqna-gaudi-backend-server + - BACKEND_SERVICE_IP=chatqna-xeon-backend-server - BACKEND_SERVICE_PORT=8888 - - DATAPREP_SERVICE_IP=dataprep-redis-service + - DATAPREP_SERVICE_IP=dataprep-pinecone-service - DATAPREP_SERVICE_PORT=6007 ipc: host restart: always diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml index 8d37bb83af..ad7df8fa79 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml @@ -72,7 +72,7 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "6042:80" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml index 6e9d9ac200..3735b75f04 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml @@ -86,6 +86,7 @@ services: https_proxy: ${https_proxy} HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} LLM_MODEL_ID: ${LLM_MODEL_ID} + VLLM_TORCH_PROFILER_DIR: "/mnt" command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 chatqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml index e497985f8a..938a6690d3 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml @@ -57,7 +57,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "9009:80" diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh b/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh old mode 100644 new mode 100755 diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index 43aa720f02..ad56d525a4 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -26,7 +26,7 @@ To set up environment variables for deploying ChatQnA services, follow these ste export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - export 
no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,vllm-ray-service,guardrails + export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,guardrails ``` 3. Set up other environment variables: @@ -103,7 +103,7 @@ docker build -t opea/guardrails-tgi:latest --build-arg https_proxy=$https_proxy ```bash git clone https://github.com/opea-project/GenAIExamples.git - cd GenAIExamples/ChatQnA/docker + cd GenAIExamples/ChatQnA docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` @@ -123,7 +123,7 @@ docker build -t opea/guardrails-tgi:latest --build-arg https_proxy=$https_proxy ```bash git clone https://github.com/opea-project/GenAIExamples.git - cd GenAIExamples/ChatQnA/docker + cd GenAIExamples/ChatQnA docker build --no-cache -t opea/chatqna-without-rerank:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.without_rerank . ``` @@ -192,7 +192,7 @@ For users in China who are unable to download models directly from Huggingface, export HF_TOKEN=${your_hf_token} export HF_ENDPOINT="https://hf-mirror.com" model_name="Intel/neural-chat-7b-v3-3" - docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model_name --max-input-tokens 1024 --max-total-tokens 2048 + docker run -p 8008:80 -v ./data:/data --name tgi-service -e HF_ENDPOINT=$HF_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id $model_name --max-input-tokens 1024 --max-total-tokens 2048 ``` 2. 
Offline @@ -206,7 +206,7 @@ For users in China who are unable to download models directly from Huggingface, ```bash export HF_TOKEN=${your_hf_token} export model_path="/path/to/model" - docker run -p 8008:80 -v $model_path:/data --name tgi_service --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id /data --max-input-tokens 1024 --max-total-tokens 2048 + docker run -p 8008:80 -v $model_path:/data --name tgi_service --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e HUGGING_FACE_HUB_TOKEN=$HF_TOKEN -e ENABLE_HPU_GRAPH=true -e LIMIT_HPU_GRAPH=true -e USE_FLASH_ATTENTION=true -e FLASH_ATTENTION_RECOMPUTE=true --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.6 --model-id /data --max-input-tokens 1024 --max-total-tokens 2048 ``` ### Setup Environment Variables @@ -227,7 +227,7 @@ For users in China who are unable to download models directly from Huggingface, export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,vllm-ray-service,guardrails + export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,guardrails ``` 3. Set up other environment variables: @@ -257,12 +257,6 @@ If use vllm for llm backend. docker compose -f compose_vllm.yaml up -d ``` -If use vllm-on-ray for llm backend. - -```bash -docker compose -f compose_vllm_ray.yaml up -d -``` - If you want to enable guardrails microservice in the pipeline, please follow the below command instead: ```bash @@ -332,30 +326,18 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid Then try the `cURL` command below to validate services. ```bash - #TGI Service - curl http://${host_ip}:8005/generate \ + # TGI service + curl http://${host_ip}:9009/v1/chat/completions \ -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ + -d '{"model": "'"${LLM_MODEL_ID}"'", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` ```bash - #vLLM Service - curl http://${host_ip}:8007/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "${LLM_MODEL_ID}", - "prompt": "What is Deep Learning?", - "max_tokens": 32, - "temperature": 0 - }' - ``` - - ```bash - #vLLM-on-Ray Service - curl http://${host_ip}:8006/v1/chat/completions \ + # vLLM Service + curl http://${host_ip}:9009/v1/chat/completions \ -H "Content-Type: application/json" \ - -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' + -d '{"model": "'"${LLM_MODEL_ID}"'", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' ``` 5.
MegaService diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml index e34f072b5c..170ab54353 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -26,25 +26,17 @@ services: TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-embedding-gaudi-server ports: - "8090:80" volumes: - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} @@ -65,7 +57,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 container_name: tei-reranking-gaudi-server ports: - "8808:80" @@ -86,7 +78,7 @@ services: MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: tgi-gaudi-server ports: - "8005:80" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml index 7b9d391fea..7bebade290 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml @@ -26,7 +26,7 @@ services: TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tgi-guardrails-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: tgi-guardrails-server ports: - "8088:80" @@ -65,25 +65,17 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-embedding-gaudi-server ports: - "8090:80" volumes: - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} @@ -104,7 +96,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 container_name: tei-reranking-gaudi-server ports: - "8808:80" @@ -125,7 +117,7 @@ services: MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${RERANK_MODEL_ID} --auto-truncate tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 
container_name: tgi-gaudi-server ports: - "8008:80" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml index cd6ef69ec5..bfbbb9570b 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml @@ -26,25 +26,17 @@ services: TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-embedding-gaudi-server ports: - "8090:80" volumes: - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} @@ -65,7 +57,7 @@ services: HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 container_name: tei-reranking-gaudi-server ports: - "8808:80" @@ -86,7 +78,7 @@ services: MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${RERANK_MODEL_ID} --auto-truncate vllm-service: - image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest} + image: ${REGISTRY:-opea}/vllm-hpu:${TAG:-latest} container_name: vllm-gaudi-server ports: - "8007:80" @@ -104,7 +96,7 @@ services: cap_add: - SYS_NICE ipc: host - command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048" + command: --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 chatqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml index 7c2323157f..524b44c1a0 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml @@ -26,25 +26,17 @@ services: TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 container_name: tei-embedding-gaudi-server ports: - "8090:80" volumes: - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host + shm_size: 1g environment: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} @@ -65,7 +57,7 @@ services: HUGGINGFACEHUB_API_TOKEN: 
${HUGGINGFACEHUB_API_TOKEN} restart: unless-stopped tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: tgi-gaudi-server ports: - "8005:80" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md index 7448ae625c..d9684e9dbd 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md @@ -48,16 +48,16 @@ f810f3b4d329 opea/embedding-tei:latest "python e 2fa17d84605f opea/dataprep-redis:latest "python prepare_doc_…" 2 minutes ago Up 2 minutes 0.0.0.0:6007->6007/tcp, :::6007->6007/tcp dataprep-redis-server 69e1fb59e92c opea/retriever-redis:latest "/home/user/comps/re…" 2 minutes ago Up 2 minutes 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-redis-server 313b9d14928a opea/reranking-tei:latest "python reranking_te…" 2 minutes ago Up 2 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp reranking-tei-gaudi-server -05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server -174bd43fa6b5 ghcr.io/huggingface/tei-gaudi:latest "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server +174bd43fa6b5 ghcr.io/huggingface/tei-gaudi:1.5.0 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8090->80/tcp, :::8090->80/tcp tei-embedding-gaudi-server +05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server 74084469aa33 redis/redis-stack:7.2.0-v9 "/entrypoint.sh" 2 minutes ago Up 2 minutes 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp redis-vector-db 88399dbc9e43 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 2 minutes ago Up 2 minutes 0.0.0.0:8808->80/tcp, :::8808->80/tcp tei-reranking-gaudi-server ``` -In this case, `ghcr.io/huggingface/tgi-gaudi:2.0.5` Existed. +In this case, `ghcr.io/huggingface/tgi-gaudi:2.0.6` exited. ``` -05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server +05c40b636239 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 2 minutes ago Exited (1) About a minute ago tgi-gaudi-server ``` Next we can check the container logs to get to know what happened during the docker start. @@ -68,7 +68,7 @@ Check the log of container by: `docker logs -t` -View the logs of `ghcr.io/huggingface/tgi-gaudi:2.0.5` +View the logs of `ghcr.io/huggingface/tgi-gaudi:2.0.6` `docker logs 05c40b636239 -t` @@ -97,7 +97,7 @@ So just make sure the devices are available. Here is another failure example: ``` -f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.0.5 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server +f7a08f9867f9 ghcr.io/huggingface/tgi-gaudi:2.0.6 "text-generation-lau…" 16 seconds ago Exited (2) 14 seconds ago tgi-gaudi-server ``` Check the log by `docker logs f7a08f9867f9 -t`.
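 To speed up this kind of triage, a helper along the following lines can list every exited container and dump its most recent log lines. This is an illustrative sketch only, not a script shipped with the repository: ```bash # Print the last 20 log lines of every exited container. for cid in $(docker ps -aq --filter "status=exited"); do echo "=== $cid ==="; docker logs --tail 20 "$cid" 2>&1; done ```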
@@ -114,7 +114,7 @@ View the docker input parameters in `./ChatQnA/docker_compose/intel/hpu/gaudi/co ``` tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: tgi-gaudi-server ports: - "8008:80" diff --git a/ChatQnA/docker_compose/nvidia/gpu/README.md b/ChatQnA/docker_compose/nvidia/gpu/README.md index 5cd8d3ef08..fc647a5552 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/README.md +++ b/ChatQnA/docker_compose/nvidia/gpu/README.md @@ -17,8 +17,6 @@ To set up environment variables for deploying ChatQnA services, follow these ste ```bash # Example: host_ip="192.168.1.1" export host_ip="External_Public_IP" - # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - export no_proxy="Your_No_Proxy" export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token" ``` @@ -27,6 +25,8 @@ To set up environment variables for deploying ChatQnA services, follow these ste ```bash export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" + # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" + export no_proxy="Your_No_Proxy",chatqna-ui-server,chatqna-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service ``` 3. Set up other environment variables: @@ -95,9 +95,9 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op ```bash git clone https://github.com/opea-project/GenAIExamples.git -cd GenAIExamples/ChatQnA/docker +cd GenAIExamples/ChatQnA docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . -cd ../../.. +cd ../.. ``` ### 5. Build UI Docker Image @@ -107,7 +107,7 @@ Construct the frontend Docker image using the command below: ```bash cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile . -cd ../../../.. +cd ../../../ ``` ### 6. Build React UI Docker Image (Optional) @@ -117,7 +117,7 @@ Construct the frontend Docker image using the command below: ```bash cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-react-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . -cd ../../../.. +cd ../../.. ``` ### 7. Build Nginx Docker Image @@ -156,8 +156,6 @@ Change the `xxx_MODEL_ID` below for your needs. ```bash # Example: host_ip="192.168.1.1" export host_ip="External_Public_IP" - # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - export no_proxy="Your_No_Proxy" export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token" # Example: NGINX_PORT=80 export NGINX_PORT=${your_nginx_port} @@ -168,6 +166,8 @@ Change the `xxx_MODEL_ID` below for your needs. ```bash export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" + # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" + export no_proxy="Your_No_Proxy",chatqna-ui-server,chatqna-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service ``` 3. Set up other environment variables: @@ -238,9 +238,9 @@ docker compose up -d Then try the `cURL` command below to validate TGI. 
```bash - curl http://${host_ip}:8008/generate \ + curl http://${host_ip}:9009/v1/chat/completions \ -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \ + -d '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \ -H 'Content-Type: application/json' ``` diff --git a/ChatQnA/docker_compose/nvidia/gpu/compose.yaml b/ChatQnA/docker_compose/nvidia/gpu/compose.yaml index c35866b101..ba504c2eb3 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/compose.yaml +++ b/ChatQnA/docker_compose/nvidia/gpu/compose.yaml @@ -20,10 +20,10 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} - REDIS_HOST: ${REDIS_HOST} + REDIS_URL: redis://redis-vector-db:6379 + REDIS_HOST: redis-vector-db INDEX_NAME: ${INDEX_NAME} - TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} + TEI_ENDPOINT: http://tei-embedding-service:80 HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 @@ -39,13 +39,6 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -58,12 +51,13 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} + REDIS_URL: redis://redis-vector-db:6379 + REDIS_HOST: redis-vector-db INDEX_NAME: ${INDEX_NAME} TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} restart: unless-stopped tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + image: ghcr.io/huggingface/text-embeddings-inference:1.5 container_name: tei-reranking-server ports: - "8808:80" @@ -123,11 +117,14 @@ services: - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} - - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - MEGA_SERVICE_HOST_IP=chaqna-backend-server + - EMBEDDING_SERVER_HOST_IP=tei-embedding-service + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80} + - RETRIEVER_SERVICE_HOST_IP=retriever + - RERANK_SERVER_HOST_IP=tei-reranking-service + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} + - LLM_SERVER_HOST_IP=tgi-service + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-80} ipc: host restart: always chaqna-ui-server: diff --git a/ChatQnA/docker_image_build/build.yaml b/ChatQnA/docker_image_build/build.yaml index 3902313208..7be5141ead 100644 --- a/ChatQnA/docker_image_build/build.yaml +++ b/ChatQnA/docker_image_build/build.yaml @@ -53,6 +53,12 @@ services: dockerfile: comps/retrievers/qdrant/haystack/Dockerfile extends: chatqna image: ${REGISTRY:-opea}/retriever-qdrant:${TAG:-latest} + retriever-pinecone: + build: + context: GenAIComps + dockerfile: comps/retrievers/pinecone/langchain/Dockerfile + extends: chatqna + image: ${REGISTRY:-opea}/retriever-pinecone:${TAG:-latest} reranking-tei: build: context: GenAIComps @@ -77,24 +83,6 @@ services: dockerfile: comps/llms/text-generation/vllm/langchain/Dockerfile extends: chatqna image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest} - llm-vllm-hpu: - 
build: - context: GenAIComps - dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu - extends: chatqna - image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest} - llm-vllm-ray: - build: - context: GenAIComps - dockerfile: comps/llms/text-generation/vllm/ray/Dockerfile - extends: chatqna - image: ${REGISTRY:-opea}/llm-vllm-ray:${TAG:-latest} - llm-vllm-ray-hpu: - build: - context: GenAIComps - dockerfile: comps/llms/text-generation/vllm/ray/dependency/Dockerfile - extends: chatqna - image: ${REGISTRY:-opea}/llm-vllm-ray-hpu:${TAG:-latest} dataprep-redis: build: context: GenAIComps @@ -107,6 +95,12 @@ services: dockerfile: comps/dataprep/qdrant/langchain/Dockerfile extends: chatqna image: ${REGISTRY:-opea}/dataprep-qdrant:${TAG:-latest} + dataprep-pinecone: + build: + context: GenAIComps + dockerfile: comps/dataprep/pinecone/langchain/Dockerfile + extends: chatqna + image: ${REGISTRY:-opea}/dataprep-pinecone:${TAG:-latest} guardrails-tgi: build: context: GenAIComps @@ -119,6 +113,12 @@ services: dockerfile: Dockerfile.cpu extends: chatqna image: ${REGISTRY:-opea}/vllm:${TAG:-latest} + vllm-hpu: + build: + context: vllm-fork + dockerfile: Dockerfile.hpu + extends: chatqna + image: ${REGISTRY:-opea}/vllm-hpu:${TAG:-latest} nginx: build: context: GenAIComps diff --git a/ChatQnA/kubernetes/intel/README_gmc.md b/ChatQnA/kubernetes/intel/README_gmc.md index dab86381fe..2c849c5079 100644 --- a/ChatQnA/kubernetes/intel/README_gmc.md +++ b/ChatQnA/kubernetes/intel/README_gmc.md @@ -18,14 +18,15 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment - tei_embedding_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - retriever: opea/retriever-redis:latest - tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 -- tgi-service: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu +- tgi-service: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu - chaqna-xeon-backend-server: opea/chatqna:latest Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services. For Gaudi: -- tei-embedding-service: ghcr.io/huggingface/tei-gaudi:latest -- tgi-service: gghcr.io/huggingface/tgi-gaudi:2.0.5 +- tei-embedding-service: ghcr.io/huggingface/tei-gaudi:1.5.0 +- tgi-service: ghcr.io/huggingface/tgi-gaudi:2.0.6 + > [NOTE] > Please refer to [Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/docker_compose/intel/cpu/xeon/README.md) or [Gaudi README](https://github.com/opea-project/GenAIExamples/blob/main/ChatQnA/docker_compose/intel/hpu/gaudi/README.md) to build the OPEA images. These too will be available on Docker Hub soon to simplify use. diff --git a/ChatQnA/kubernetes/intel/README_single_node.md b/ChatQnA/kubernetes/intel/README_single_node.md new file mode 100644 index 0000000000..411f4cfdd6 --- /dev/null +++ b/ChatQnA/kubernetes/intel/README_single_node.md @@ -0,0 +1,53 @@ +# Deploy ChatQnA in Kubernetes Cluster on Single Node Environment (Minikube) + +The following instructions are to deploy the ChatQnA example on a single node using Kubernetes for testing purposes. + +## Minikube setup +1. Install [Minikube](https://minikube.sigs.k8s.io/docs/start/) following the quickstart guide +2. Install [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) +3.
Build the container images, following the steps under the "Build Docker Images" section in the [docker-compose README](../../docker_compose/intel/cpu/xeon/README.md), to check out [GenAIComps](https://github.com/opea-project/GenAIComps.git) and build the other images with your changes for development. +```bash +# Example on building frontend Docker image +cd GenAIExamples/ChatQnA/ui +docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile . +# etc... +``` +The built images should be visible in the local Docker registry. Other images which have not been built with your changes (or are not present in your local Docker registry) will be pulled from [docker hub](https://hub.docker.com/u/opea) by Minikube later in step 6. +```bash +docker images | grep opea +# REPOSITORY TAG IMAGE ID CREATED SIZE +# opea/chatqna-ui latest 8f2fa2523b85 6 days ago 1.56GB +# opea/chatqna latest 7f2602a7a266 6 days ago 821MB +# ... +``` +4. The built images must be imported into the Minikube registry from the local Docker registry. This can be done using `minikube image load`. +```bash +minikube image load opea/chatqna +minikube image load opea/chatqna-ui +# etc... +``` +5. Start the minikube cluster with `minikube start`, then check that the minikube container (kicbase) is up with `docker ps` +```bash +docker ps +# CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +# de088666cef2 gcr.io/k8s-minikube/kicbase:v0.0.45 "/usr/local/bin/entr…" 2 days ago Up 2 days 127.0.0.1:49157->22/tcp... minikube +``` +6. Deploy the ChatQnA application with `kubectl apply -f chatqna.yaml`, then check that the opea pods are in a running state with `kubectl get pods` +```bash +kubectl get pods +# NAME READY STATUS RESTARTS AGE +# chatqna-78b4f5865-qbzms 1/1 Running 0 2d3h +# chatqna-chatqna-ui-54c8dfb6cf-fll5g 1/1 Running 0 2d3h +# etc... +``` + +7.
Forward the port of the chatqna service from Minikube to the host, and test the service as you would a normal k8s cluster deployment +```bash +# port-forward to expose the chatqna endpoint from within the minikube cluster +kubectl port-forward svc/chatqna 8888:8888 +curl http://localhost:8888/v1/chatqna \ + -H 'Content-Type: application/json' \ + -d '{"messages": "What is the revenue of Nike in 2023?"}' + +# Similarly port-forward to expose the chatqna-ui endpoint and use the UI at :5173 in your browser +kubectl port-forward svc/chatqna-chatqna-ui 5173:5173 +``` diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml index 08752a8da6..7265ebff5d 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml @@ -554,7 +554,7 @@ spec: securityContext: {} image: "opea/chatqna-ui:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: ui containerPort: 5173 @@ -612,7 +612,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/dataprep-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: data-prep containerPort: 6007 @@ -687,7 +687,7 @@ spec: seccompProfile: type: RuntimeDefault image: "redis/redis-stack:7.2.0-v9" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /data name: data-volume @@ -762,7 +762,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/guardrails-tgi:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: guardrails-usvc containerPort: 9090 @@ -840,7 +840,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/retriever-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: retriever-usvc containerPort: 7000 @@ -919,7 +919,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -1010,7 +1010,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -1100,8 +1100,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" - imagePullPolicy: IfNotPresent + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -1180,8 +1180,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" - imagePullPolicy: IfNotPresent + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -1252,18 +1252,12 @@ spec: env: - name: LLM_SERVER_HOST_IP value: chatqna-tgi - - name: LLM_SERVER_PORT - value: "2080" - name: RERANK_SERVER_HOST_IP value: chatqna-teirerank - - name: RERANK_SERVER_PORT - value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - name: EMBEDDING_SERVER_HOST_IP value: chatqna-tei - - name: EMBEDDING_SERVER_PORT - value: "2081" - name: GUARDRAIL_SERVICE_HOST_IP value: chatqna-guardrails-usvc - name: GUARDRAIL_SERVICE_PORT @@ -1279,7 +1273,7 @@ spec: 
seccompProfile: type: RuntimeDefault image: "opea/chatqna-guardrails:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /tmp name: tmp @@ -1320,7 +1314,7 @@ spec: spec: containers: - image: nginx:1.27.1 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always name: nginx volumeMounts: - mountPath: /etc/nginx/conf.d diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml index 22155dfad6..26813816ed 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml @@ -454,7 +454,7 @@ spec: securityContext: {} image: "opea/chatqna-ui:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: ui containerPort: 5173 @@ -512,7 +512,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/dataprep-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: data-prep containerPort: 6007 @@ -587,7 +587,7 @@ spec: seccompProfile: type: RuntimeDefault image: "redis/redis-stack:7.2.0-v9" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /data name: data-volume @@ -662,7 +662,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/retriever-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: retriever-usvc containerPort: 7000 @@ -741,7 +741,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -832,7 +832,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -922,8 +922,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" - imagePullPolicy: IfNotPresent + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -994,18 +994,12 @@ spec: env: - name: LLM_SERVER_HOST_IP value: chatqna-tgi - - name: LLM_SERVER_PORT - value: "2080" - name: RERANK_SERVER_HOST_IP value: chatqna-teirerank - - name: RERANK_SERVER_PORT - value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - name: EMBEDDING_SERVER_HOST_IP value: chatqna-tei - - name: EMBEDDING_SERVER_PORT - value: "2081" securityContext: allowPrivilegeEscalation: false capabilities: @@ -1017,7 +1011,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/chatqna:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /tmp name: tmp @@ -1058,7 +1052,7 @@ spec: spec: containers: - image: nginx:1.27.1 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always name: nginx volumeMounts: - mountPath: /etc/nginx/conf.d diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml index 9eba55f9f9..aac57140b7 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml @@ -455,7 +455,7 @@ spec: securityContext: {} image: "opea/chatqna-ui:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: ui containerPort: 5173 @@ -513,7 +513,7 @@ 
spec: seccompProfile: type: RuntimeDefault image: "opea/dataprep-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: data-prep containerPort: 6007 @@ -588,7 +588,7 @@ spec: seccompProfile: type: RuntimeDefault image: "redis/redis-stack:7.2.0-v9" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /data name: data-volume @@ -663,7 +663,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/retriever-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: retriever-usvc containerPort: 7000 @@ -742,7 +742,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -833,7 +833,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -925,8 +925,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" - imagePullPolicy: IfNotPresent + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -997,18 +997,12 @@ spec: env: - name: LLM_SERVER_HOST_IP value: chatqna-tgi - - name: LLM_SERVER_PORT - value: "2080" - name: RERANK_SERVER_HOST_IP value: chatqna-teirerank - - name: RERANK_SERVER_PORT - value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - name: EMBEDDING_SERVER_HOST_IP value: chatqna-tei - - name: EMBEDDING_SERVER_PORT - value: "2081" securityContext: allowPrivilegeEscalation: false capabilities: @@ -1020,7 +1014,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/chatqna:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /tmp name: tmp @@ -1061,7 +1055,7 @@ spec: spec: containers: - image: nginx:1.27.1 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always name: nginx volumeMounts: - mountPath: /etc/nginx/conf.d diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml index dd4ec145d4..a802889f8b 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml @@ -556,7 +556,7 @@ spec: securityContext: {} image: "opea/chatqna-ui:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: ui containerPort: 5173 @@ -614,7 +614,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/dataprep-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: data-prep containerPort: 6007 @@ -692,7 +692,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/guardrails-tgi:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: guardrails-usvc containerPort: 9090 @@ -767,7 +767,7 @@ spec: seccompProfile: type: RuntimeDefault image: "redis/redis-stack:7.2.0-v9" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /data name: data-volume @@ -842,7 +842,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/retriever-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: retriever-usvc containerPort: 7000 @@ 
-920,7 +920,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tei-gaudi:latest" + image: "ghcr.io/huggingface/tei-gaudi:1.5.0" imagePullPolicy: IfNotPresent args: - "--auto-truncate" @@ -1013,7 +1013,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -1103,8 +1103,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" - imagePullPolicy: IfNotPresent + image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -1184,8 +1184,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" - imagePullPolicy: IfNotPresent + image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -1257,18 +1262,12 @@ spec: env: - name: LLM_SERVER_HOST_IP value: chatqna-tgi - - name: LLM_SERVER_PORT - value: "2080" - name: RERANK_SERVER_HOST_IP value: chatqna-teirerank - - name: RERANK_SERVER_PORT - value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - name: EMBEDDING_SERVER_HOST_IP value: chatqna-tei - - name: EMBEDDING_SERVER_PORT - value: "2081" - name: GUARDRAIL_SERVICE_HOST_IP value: chatqna-guardrails-usvc - name: GUARDRAIL_SERVICE_PORT @@ -1284,7 +1283,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/chatqna-guardrails:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /tmp name: tmp @@ -1325,7 +1324,7 @@ spec: spec: containers: - image: nginx:1.27.1 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always name: nginx volumeMounts: - mountPath: /etc/nginx/conf.d diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml index 988f48ca26..949e7cd8ea 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml @@ -592,7 +592,7 @@ spec: securityContext: {} image: "opea/chatqna-ui:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: ui containerPort: 5173 @@ -650,7 +650,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/dataprep-redis:v0.9" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: data-prep containerPort: 6007 @@ -728,7 +728,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/embedding-tei:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: embedding-usvc containerPort: 6000 @@ -806,7 +806,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/llm-vllm:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: llm-uservice containerPort: 9000 @@ -881,7 +881,7 @@ spec: seccompProfile: type: RuntimeDefault image: "redis/redis-stack:7.2.0-v9" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /data name: data-volume @@ -956,7 +956,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/reranking-tei:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: reranking-usvc containerPort: 8000 @@ -1034,7 +1034,7 @@ spec:
seccompProfile: type: RuntimeDefault image: "opea/retriever-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: retriever-usvc containerPort: 7000 @@ -1106,7 +1106,7 @@ spec: privileged: true capabilities: add: ["SYS_NICE"] - image: "ghcr.io/huggingface/tei-gaudi:latest" + image: "ghcr.io/huggingface/tei-gaudi:1.5.0" imagePullPolicy: IfNotPresent args: - "--auto-truncate" @@ -1193,7 +1193,7 @@ spec: securityContext: {} image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -1281,7 +1281,7 @@ spec: - | export VLLM_CPU_KVCACHE_SPACE=40 && \ python3 -m vllm.entrypoints.openai.api_server --enforce-eager --gpu-memory-utilization 0.5 --dtype auto --model $MODEL_ID --port 2080 --tensor-parallel-size 8 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -1363,7 +1363,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/chatqna:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /tmp name: tmp @@ -1404,7 +1404,7 @@ spec: spec: containers: - image: nginx:1.27.1 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always name: nginx volumeMounts: - mountPath: /etc/nginx/conf.d diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml index 56d8720b9a..7c31d09d67 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml @@ -455,7 +455,7 @@ spec: securityContext: {} image: "opea/chatqna-ui:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: ui containerPort: 5173 @@ -513,7 +513,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/dataprep-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: data-prep containerPort: 6007 @@ -588,7 +588,7 @@ spec: seccompProfile: type: RuntimeDefault image: "redis/redis-stack:7.2.0-v9" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /data name: data-volume @@ -663,7 +663,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/retriever-redis:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always ports: - name: retriever-usvc containerPort: 7000 @@ -741,7 +741,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tei-gaudi:latest" + image: "ghcr.io/huggingface/tei-gaudi:1.5.0" imagePullPolicy: IfNotPresent args: - "--auto-truncate" @@ -834,7 +834,7 @@ spec: seccompProfile: type: RuntimeDefault image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always args: - "--auto-truncate" volumeMounts: @@ -924,8 +924,8 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" - imagePullPolicy: IfNotPresent + image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" + imagePullPolicy: Always volumeMounts: - mountPath: /data name: model-volume @@ -997,18 +997,12 @@ spec: env: - name: LLM_SERVER_HOST_IP value: chatqna-tgi - - name: LLM_SERVER_PORT - value: "2080" - name: RERANK_SERVER_HOST_IP value: chatqna-teirerank - - name: RERANK_SERVER_PORT - value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - name: EMBEDDING_SERVER_HOST_IP value: 
chatqna-tei - - name: EMBEDDING_SERVER_PORT - value: "2081" securityContext: allowPrivilegeEscalation: false capabilities: @@ -1020,7 +1014,7 @@ spec: seccompProfile: type: RuntimeDefault image: "opea/chatqna:latest" - imagePullPolicy: IfNotPresent + imagePullPolicy: Always volumeMounts: - mountPath: /tmp name: tmp @@ -1061,7 +1055,7 @@ spec: spec: containers: - image: nginx:1.27.1 - imagePullPolicy: IfNotPresent + imagePullPolicy: Always name: nginx volumeMounts: - mountPath: /etc/nginx/conf.d diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh index fcc3f80416..c186d64345 100644 --- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh @@ -22,9 +22,9 @@ function build_docker_images() { service_list="chatqna-guardrails chatqna-ui dataprep-redis retriever-redis guardrails-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:latest + docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh index 1d5b8bc8a7..23c302e8c9 100644 --- a/ChatQnA/tests/test_compose_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_on_gaudi.sh @@ -22,9 +22,9 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep-redis retriever-redis nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:latest + docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh index f906dfabbf..3535159b3f 100644 --- a/ChatQnA/tests/test_compose_on_xeon.sh +++ b/ChatQnA/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/ChatQnA/tests/test_compose_pinecone_on_xeon.sh b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh new file mode 100755 index 0000000000..a95b90c160 --- /dev/null +++ b/ChatQnA/tests/test_compose_pinecone_on_xeon.sh @@ -0,0 +1,233 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -e +IMAGE_REPO=${IMAGE_REPO:-"opea"} +IMAGE_TAG=${IMAGE_TAG:-"latest"} +echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" +echo "TAG=IMAGE_TAG=${IMAGE_TAG}" +export REGISTRY=${IMAGE_REPO} +export TAG=${IMAGE_TAG} + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH/docker_image_build + git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git 
checkout "${opea_branch:-"main"}" && cd ../ + + echo "Build all the images with --no-cache, check docker_image_build.log for details..." + service_list="chatqna chatqna-ui dataprep-pinecone retriever-pinecone nginx" + docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log + + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu + docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + + docker images && sleep 1s +} + +function start_services() { + cd $WORKPATH/docker_compose/intel/cpu/xeon/ + export no_proxy=${no_proxy},${ip_address} + export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" + export RERANK_MODEL_ID="BAAI/bge-reranker-base" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" + export PINECONE_API_KEY=${PINECONE_KEY_LANGCHAIN_TEST} + export PINECONE_INDEX_NAME="langchain-test" + export INDEX_NAME="langchain-test" + export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} + + # Start Docker Containers + docker compose -f compose_pinecone.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + + n=0 + until [[ "$n" -ge 500 ]]; do + docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log + if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then + break + fi + sleep 1s + n=$((n+1)) + done +} + +function validate_service() { + local URL="$1" + local EXPECTED_RESULT="$2" + local SERVICE_NAME="$3" + local DOCKER_NAME="$4" + local INPUT_DATA="$5" + + if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then + cd $LOG_PATH + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") + elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") + else + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") + fi + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + + + # check response status + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + echo "Response" + echo $RESPONSE_BODY + echo "Expected Result" + echo $EXPECTED_RESULT + # check response body + if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi + + sleep 1s +} + +function validate_microservices() { + # Check if the microservices are running correctly. 
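+    # Each validate_service call below curls one endpoint, checks for HTTP 200,
+    # and greps the response body for an expected substring (see validate_service
+    # above). Services are checked in dependency order: embedding, dataprep,
+    # retriever, reranking, then the LLM.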
+ + # tei for embedding service + validate_service \ + "${ip_address}:6006/embed" \ + "[[" \ + "tei-embedding" \ + "tei-embedding-server" \ + '{"inputs":"What is Deep Learning?"}' + + sleep 1m # retrieval can't curl as expected, try to wait for more time + + # test /v1/dataprep/delete_file + validate_service \ + "http://${ip_address}:6009/v1/dataprep/delete_file" \ + '{"status":true}' \ + "dataprep_del" \ + "dataprep-pinecone-server" + + + # test /v1/dataprep upload file + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt + validate_service \ + "http://${ip_address}:6007/v1/dataprep" \ + "Data preparation succeeded" \ + "dataprep_upload_file" \ + "dataprep-pinecone-server" + + + # retrieval microservice + test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + validate_service \ + "${ip_address}:7000/v1/retrieval" \ + " " \ + "retrieval" \ + "retriever-pinecone-server" \ + "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" + + # tei for rerank microservice + echo "Validating reranking service" + validate_service \ + "${ip_address}:8808/rerank" \ + '{"index":1,"score":' \ + "tei-rerank" \ + "tei-reranking-server" \ + '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' + + + # tgi for llm service + echo "Validating llm service" + validate_service \ + "${ip_address}:9009/generate" \ + "generated_text" \ + "tgi-llm" \ + "tgi-service" \ + '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' + +} + +function validate_megaservice() { + # Curl the Mega Service + validate_service \ + "${ip_address}:8888/v1/chatqna" \ + "data: " \ + "chatqna-megaservice" \ + "chatqna-xeon-backend-server" \ + '{"messages": "What is the revenue of Nike in 2023?"}' + +} + +function validate_frontend() { + echo "[ TEST INFO ]: --------- frontend test started ---------" + cd $WORKPATH/ui/svelte + local conda_env_name="OPEA_e2e" + export PATH=${HOME}/miniforge3/bin/:$PATH + if conda info --envs | grep -q "$conda_env_name"; then + echo "$conda_env_name exist!" + else + conda create -n ${conda_env_name} python=3.12 -y + fi + source activate ${conda_env_name} + echo "[ TEST INFO ]: --------- conda env activated ---------" + + sed -i "s/localhost/$ip_address/g" playwright.config.ts + + conda install -c conda-forge nodejs -y + npm install && npm ci && npx playwright install --with-deps + node -v && npm -v && pip list + + exit_status=0 + npx playwright test || exit_status=$? 
+ + if [ $exit_status -ne 0 ]; then + echo "[TEST INFO]: ---------frontend test failed---------" + exit $exit_status + else + echo "[TEST INFO]: ---------frontend test passed---------" + fi +} + +function stop_docker() { + echo "In stop docker" + echo $WORKPATH + cd $WORKPATH/docker_compose/intel/cpu/xeon/ + docker compose -f compose_pinecone.yaml down +} + +function main() { + + stop_docker + + if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi + + start_time=$(date +%s) + start_services + end_time=$(date +%s) + duration=$((end_time-start_time)) + echo "Mega service start duration is $duration s" && sleep 1s + + if [ "${mode}" == "perf" ]; then + python3 $WORKPATH/tests/chatqna_benchmark.py + elif [ "${mode}" == "" ]; then + validate_microservices + echo "==== microservices validated ====" + validate_megaservice + echo "==== megaservice validated ====" + fi + + stop_docker + echo y | docker system prune + +} + +main diff --git a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh index de6cd50ede..26bef067db 100644 --- a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh @@ -17,13 +17,14 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH/docker_image_build git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ + git clone https://github.com/HabanaAI/vllm-fork.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-redis retriever-redis llm-vllm-hpu nginx" + service_list="chatqna chatqna-ui dataprep-redis retriever-redis vllm-hpu nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:latest + docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_vllm_on_xeon.sh b/ChatQnA/tests/test_compose_vllm_on_xeon.sh index b664a6af8c..f53fd3aeaa 100644 --- a/ChatQnA/tests/test_compose_vllm_on_xeon.sh +++ b/ChatQnA/tests/test_compose_vllm_on_xeon.sh @@ -23,7 +23,7 @@ function build_docker_images() { service_list="chatqna chatqna-ui dataprep-redis retriever-redis vllm nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh deleted file mode 100644 index d7d1dbe6bf..0000000000 --- a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh +++ /dev/null @@ -1,183 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -e -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" -export REGISTRY=${IMAGE_REPO} -export TAG=${IMAGE_TAG} - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') - -function build_docker_images() { - cd $WORKPATH/docker_image_build - git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - - 
echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-redis retriever-redis llm-vllm-ray-hpu nginx" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:latest - docker images && sleep 1s -} - -function start_services() { - - cd $WORKPATH/docker_compose/intel/hpu/gaudi - export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" - export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" - export INDEX_NAME="rag-redis" - export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} - - # Start Docker Containers - docker compose -f compose_vllm_ray.yaml up -d > ${LOG_PATH}/start_services_with_compose.log - n=0 - until [[ "$n" -ge 100 ]]; do - echo "n=$n" - docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log - if grep -q "Warmup finished" vllm_ray_service_start.log; then - break - fi - sleep 5s - n=$((n+1)) - done -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_microservices() { - # Check if the microservices are running correctly. 
- - # tei for embedding service - validate_services \ - "${ip_address}:8090/embed" \ - "\[\[" \ - "tei-embedding" \ - "tei-embedding-gaudi-server" \ - '{"inputs":"What is Deep Learning?"}' - - sleep 1m # retrieval can't curl as expected, try to wait for more time - - # retrieval microservice - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_services \ - "${ip_address}:7000/v1/retrieval" \ - " " \ - "retrieval" \ - "retriever-redis-server" \ - "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" - - # tei for rerank microservice - validate_services \ - "${ip_address}:8808/rerank" \ - '{"index":1,"score":' \ - "tei-rerank" \ - "tei-reranking-gaudi-server" \ - '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - - # vllm-on-ray for llm service - validate_services \ - "${ip_address}:8006/v1/chat/completions" \ - "content" \ - "vllm-ray-llm" \ - "vllm-ray-gaudi-server" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -} - -function validate_megaservice() { - # Curl the Mega Service - validate_services \ - "${ip_address}:8888/v1/chatqna" \ - "data: " \ - "mega-chatqna" \ - "chatqna-gaudi-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' - -} - -function validate_frontend() { - cd $WORKPATH/ui/svelte - local conda_env_name="OPEA_e2e" - export PATH=${HOME}/miniforge3/bin/:$PATH - if conda info --envs | grep -q "$conda_env_name"; then - echo "$conda_env_name exist!" - else - conda create -n ${conda_env_name} python=3.12 -y - fi - source activate ${conda_env_name} - - sed -i "s/localhost/$ip_address/g" playwright.config.ts - - conda install -c conda-forge nodejs -y - npm install && npm ci && npx playwright install --with-deps - node -v && npm -v && pip list - - exit_status=0 - npx playwright test || exit_status=$? 
- - if [ $exit_status -ne 0 ]; then - echo "[TEST INFO]: ---------frontend test failed---------" - exit $exit_status - else - echo "[TEST INFO]: ---------frontend test passed---------" - fi -} - -function stop_docker() { - cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose -f compose_vllm_ray.yaml down -} - -function main() { - - stop_docker - if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi - start_time=$(date +%s) - start_services - end_time=$(date +%s) - duration=$((end_time-start_time)) - echo "Mega service start duration is $duration s" - - validate_microservices - validate_megaservice - # validate_frontend - - stop_docker - echo y | docker system prune - -} - -main diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh index 22c5e8c947..f06a189ef7 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh @@ -22,9 +22,9 @@ function build_docker_images() { service_list="chatqna-without-rerank chatqna-ui dataprep-redis retriever-redis nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:latest + docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 docker images && sleep 1s } diff --git a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh index b0ffc22bcd..89b4922617 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="chatqna-without-rerank chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 docker images && sleep 1s diff --git a/ChatQnA/tests/test_manifest_on_gaudi.sh b/ChatQnA/tests/test_manifest_on_gaudi.sh index 2716efa4b0..8bcccab377 100755 --- a/ChatQnA/tests/test_manifest_on_gaudi.sh +++ b/ChatQnA/tests/test_manifest_on_gaudi.sh @@ -111,7 +111,7 @@ function _cleanup_ns() { function install_and_validate_chatqna_guardrail() { echo "Testing manifests chatqna_guardrils" - local ns=${NAMESPACE}-gaurdrails + local ns=${NAMESPACE} _cleanup_ns $ns kubectl create namespace $ns # install guardrail @@ -119,10 +119,9 @@ function install_and_validate_chatqna_guardrail() { # Sleep enough time for chatqna_guardrail to be ready sleep 60 if kubectl rollout status deployment -n "$ns" --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then - echo "Waiting for cahtqna_guardrail pod ready done!" + echo "Waiting for chatqna_guardrail pod ready done!" else echo "Timeout waiting for chatqna_guardrail pod ready!" - _cleanup_ns $ns exit 1 fi @@ -130,10 +129,8 @@ function install_and_validate_chatqna_guardrail() { validate_chatqna $ns chatqna-guardrails local ret=$? 
if [ $ret -ne 0 ]; then - _cleanup_ns $ns exit 1 fi - _cleanup_ns $ns } if [ $# -eq 0 ]; then @@ -161,8 +158,7 @@ case "$1" in if [ $ret -ne 0 ]; then exit $ret fi - pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifests - set +e + pushd ChatQnA/kubernetes/intel/hpu/gaudi/manifest install_and_validate_chatqna_guardrail popd ;; diff --git a/ChatQnA/tests/test_manifest_on_xeon.sh b/ChatQnA/tests/test_manifest_on_xeon.sh index d913421a63..d405df9776 100755 --- a/ChatQnA/tests/test_manifest_on_xeon.sh +++ b/ChatQnA/tests/test_manifest_on_xeon.sh @@ -40,7 +40,7 @@ function get_end_point() { function validate_chatqna() { local ns=$1 local log=$2 - max_retry=20 + max_retry=10 # make sure microservice retriever-usvc is ready # try to curl retriever-svc for max_retry times test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") @@ -111,7 +111,7 @@ function _cleanup_ns() { function install_and_validate_chatqna_guardrail() { echo "Testing manifests chatqna_guardrils" - local ns=${NAMESPACE}-gaurdrails + local ns=${NAMESPACE} _cleanup_ns $ns kubectl create namespace $ns # install guardrail @@ -119,10 +119,9 @@ function install_and_validate_chatqna_guardrail() { # Sleep enough time for chatqna_guardrail to be ready sleep 60 if kubectl rollout status deployment -n "$ns" --timeout "$ROLLOUT_TIMEOUT_SECONDS"; then - echo "Waiting for cahtqna_guardrail pod ready done!" + echo "Waiting for chatqna_guardrail pod ready done!" else echo "Timeout waiting for chatqna_guardrail pod ready!" - _cleanup_ns $ns exit 1 fi @@ -130,10 +129,8 @@ function install_and_validate_chatqna_guardrail() { validate_chatqna $ns chatqna-guardrails local ret=$? if [ $ret -ne 0 ]; then - _cleanup_ns $ns exit 1 fi - _cleanup_ns $ns } if [ $# -eq 0 ]; then @@ -161,8 +158,7 @@ case "$1" in if [ $ret -ne 0 ]; then exit $ret fi - pushd ChatQnA/kubernetes/intel/cpu/xeon/manifests - set +e + pushd ChatQnA/kubernetes/intel/cpu/xeon/manifest install_and_validate_chatqna_guardrail popd ;; diff --git a/ChatQnA/ui/docker/Dockerfile b/ChatQnA/ui/docker/Dockerfile index ac2bb7da31..1d5115f4b5 100644 --- a/ChatQnA/ui/docker/Dockerfile +++ b/ChatQnA/ui/docker/Dockerfile @@ -23,4 +23,4 @@ RUN npm run build EXPOSE 5173 # Run the front-end application in preview mode -CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] \ No newline at end of file +CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] diff --git a/ChatQnA/ui/docker/Dockerfile.react b/ChatQnA/ui/docker/Dockerfile.react index 49bc13124c..18afc393ad 100644 --- a/ChatQnA/ui/docker/Dockerfile.react +++ b/ChatQnA/ui/docker/Dockerfile.react @@ -18,4 +18,4 @@ COPY --from=vite-app /usr/app/react/dist /usr/share/nginx/html COPY ./react/env.sh /docker-entrypoint.d/env.sh COPY ./react/nginx.conf /etc/nginx/conf.d/default.conf -RUN chmod +x /docker-entrypoint.d/env.sh \ No newline at end of file +RUN chmod +x /docker-entrypoint.d/env.sh diff --git a/ChatQnA/ui/react/.env b/ChatQnA/ui/react/.env index e5d52f4213..ae0bd3732c 100644 --- a/ChatQnA/ui/react/.env +++ b/ChatQnA/ui/react/.env @@ -1,2 +1,2 @@ VITE_BACKEND_SERVICE_ENDPOINT=http://backend_address:8888/v1/chatqna -VITE_DATA_PREP_SERVICE_URL=http://backend_address:6007/v1/dataprep \ No newline at end of file +VITE_DATA_PREP_SERVICE_URL=http://backend_address:6007/v1/dataprep diff --git a/ChatQnA/ui/react/.env.production b/ChatQnA/ui/react/.env.production index a46e1e3850..9922d60127 100644 --- a/ChatQnA/ui/react/.env.production +++ 
b/ChatQnA/ui/react/.env.production @@ -1,2 +1,2 @@ VITE_BACKEND_SERVICE_ENDPOINT=APP_BACKEND_SERVICE_ENDPOINT -VITE_DATA_PREP_SERVICE_URL=APP_DATA_PREP_SERVICE_URL \ No newline at end of file +VITE_DATA_PREP_SERVICE_URL=APP_DATA_PREP_SERVICE_URL diff --git a/ChatQnA/ui/react/nginx.conf b/ChatQnA/ui/react/nginx.conf index 00433fcda7..01aef12751 100644 --- a/ChatQnA/ui/react/nginx.conf +++ b/ChatQnA/ui/react/nginx.conf @@ -17,4 +17,4 @@ server { expires 1d; } } -} \ No newline at end of file +} diff --git a/ChatQnA/ui/react/public/vite.svg b/ChatQnA/ui/react/public/vite.svg index e7b8dfb1b2..ee9fadaf9c 100644 --- a/ChatQnA/ui/react/public/vite.svg +++ b/ChatQnA/ui/react/public/vite.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/ChatQnA/ui/react/src/assets/react.svg b/ChatQnA/ui/react/src/assets/react.svg index 6c87de9bb3..8e0e0f15c0 100644 --- a/ChatQnA/ui/react/src/assets/react.svg +++ b/ChatQnA/ui/react/src/assets/react.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/ChatQnA/ui/react/src/components/Conversation/DataSource.tsx b/ChatQnA/ui/react/src/components/Conversation/DataSource.tsx index cb7b326c9b..6f1b2ab06b 100644 --- a/ChatQnA/ui/react/src/components/Conversation/DataSource.tsx +++ b/ChatQnA/ui/react/src/components/Conversation/DataSource.tsx @@ -68,4 +68,4 @@ export default function DataSource({ opened, onClose }: Props) { ) -} \ No newline at end of file +} diff --git a/ChatQnA/ui/react/src/components/UserInfoModal/UserInfoModal.tsx b/ChatQnA/ui/react/src/components/UserInfoModal/UserInfoModal.tsx index 4d54180a45..4540bd4c96 100644 --- a/ChatQnA/ui/react/src/components/UserInfoModal/UserInfoModal.tsx +++ b/ChatQnA/ui/react/src/components/UserInfoModal/UserInfoModal.tsx @@ -45,4 +45,4 @@ const UserInfoModal = () => { ) } -export default UserInfoModal \ No newline at end of file +export default UserInfoModal diff --git a/ChatQnA/ui/svelte/src/app.postcss b/ChatQnA/ui/svelte/src/app.postcss index 1bb14630c8..963bbca4ef 100644 --- a/ChatQnA/ui/svelte/src/app.postcss +++ b/ChatQnA/ui/svelte/src/app.postcss @@ -83,4 +83,4 @@ a.btn { .w-12\/12 { width: 100% -} \ No newline at end of file +} diff --git a/ChatQnA/ui/svelte/src/lib/assets/voice/svg/paste.svg b/ChatQnA/ui/svelte/src/lib/assets/voice/svg/paste.svg index 9fe89acc1f..8910f0ea64 100644 --- a/ChatQnA/ui/svelte/src/lib/assets/voice/svg/paste.svg +++ b/ChatQnA/ui/svelte/src/lib/assets/voice/svg/paste.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/ChatQnA/ui/svelte/src/lib/assets/voice/svg/uploadFile.svg b/ChatQnA/ui/svelte/src/lib/assets/voice/svg/uploadFile.svg index 362a6994eb..9a77286a8f 100644 --- a/ChatQnA/ui/svelte/src/lib/assets/voice/svg/uploadFile.svg +++ b/ChatQnA/ui/svelte/src/lib/assets/voice/svg/uploadFile.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/ChatQnA/ui/svelte/tests/test_file.txt b/ChatQnA/ui/svelte/tests/test_file.txt index 93fc5da94e..bfbd73c1c3 100644 --- a/ChatQnA/ui/svelte/tests/test_file.txt +++ b/ChatQnA/ui/svelte/tests/test_file.txt @@ -101,4 +101,4 @@ Terms of Use Privacy Sitemap Copyright © 2003 - 2023. All rights reserved. 
-CTATECH-PROD2 \ No newline at end of file +CTATECH-PROD2 diff --git a/CodeGen/Dockerfile b/CodeGen/Dockerfile index aee27a8989..e0aa7d13f0 100644 --- a/CodeGen/Dockerfile +++ b/CodeGen/Dockerfile @@ -20,7 +20,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt COPY ./codegen.py /home/user/codegen.py diff --git a/CodeGen/README.md b/CodeGen/README.md index 03288fb2df..013c31d373 100644 --- a/CodeGen/README.md +++ b/CodeGen/README.md @@ -85,12 +85,12 @@ Currently we support two ways of deploying ChatQnA services with docker compose: By default, the LLM model is set to a default value as listed below: -| Service | Model | -| ------------ | ------------------------------------------------------------------------------- | -| LLM_MODEL_ID | [meta-llama/CodeLlama-7b-hf](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | +| Service | Model | +| ------------ | --------------------------------------------------------------------------------------- | +| LLM_MODEL_ID | [Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | -[meta-llama/CodeLlama-7b-hf](https://huggingface.co/meta-llama/CodeLlama-7b-hf) is a gated model that requires submitting an access request through Hugging Face. You can replace it with another model. -Change the `LLM_MODEL_ID` below for your needs, such as: [Qwen/CodeQwen1.5-7B-Chat](https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat), [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) +[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) may be a gated model that requires submitting an access request through Hugging Face. You can replace it with another model. +Change the `LLM_MODEL_ID` below for your needs, such as: [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct) If you choose to use `meta-llama/CodeLlama-7b-hf` as LLM model, you will need to visit [here](https://huggingface.co/meta-llama/CodeLlama-7b-hf), click the `Expand to review and access` button to ask for model access. diff --git a/CodeGen/benchmark/performance/README.md b/CodeGen/benchmark/performance/README.md new file mode 100644 index 0000000000..a9d1e9d5f6 --- /dev/null +++ b/CodeGen/benchmark/performance/README.md @@ -0,0 +1,77 @@ +# CodeGen Benchmarking + +This folder contains a collection of scripts to enable inference benchmarking by leveraging a comprehensive benchmarking tool, [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), that enables throughput analysis to assess inference performance. + +By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community. + +## Purpose + +We aim to run these benchmarks and share them with the OPEA community for three primary reasons: + +- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs. +- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case. 
+- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading LLMs, serving frameworks, etc.
+
+## Metrics
+
+The benchmark reports the following metrics:
+
+- Number of Concurrent Requests
+- End-to-End Latency: P50, P90, P99 (in milliseconds)
+- End-to-End First Token Latency: P50, P90, P99 (in milliseconds)
+- Average Next Token Latency (in milliseconds)
+- Average Token Latency (in milliseconds)
+- Requests Per Second (RPS)
+- Output Tokens Per Second
+- Input Tokens Per Second
+
+Results are displayed in the terminal and saved as a CSV file named `1_testspec.yaml`.
+
+## Getting Started
+
+We recommend using Kubernetes to deploy the CodeGen service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs.
+
+### Prerequisites
+
+- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md).
+
+- Every node has direct internet access.
+- Set up kubectl on the master node with access to the Kubernetes cluster.
+- Install Python 3.8+ on the master node for running GenAIEval.
+- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods.
+- Ensure that the container's ulimit can meet the number of requests.
+
+```bash
+# How to modify the containerd ulimit:
+sudo systemctl edit containerd
+# Add two lines:
+[Service]
+LimitNOFILE=65536:1048576
+
+sudo systemctl daemon-reload; sudo systemctl restart containerd
+```
+
+### Test Steps
+
+Please deploy the CodeGen service before benchmarking.
+
+#### Run Benchmark Test
+
+Before the benchmark, we can configure the number of test queries and the test output directory:
+
+```bash
+export USER_QUERIES="[128, 128, 128, 128]"
+export TEST_OUTPUT_DIR="/tmp/benchmark_output"
+```
+
+And then run the benchmark:
+
+```bash
+bash benchmark.sh -n 1
+```
+
+The argument `-n` specifies the number of test nodes (1 in this example).
+
+#### Data Collection
+
+All test results are written to `/tmp/benchmark_output`, the folder configured by the `TEST_OUTPUT_DIR` environment variable in the previous step.
diff --git a/CodeGen/benchmark/performance/benchmark.sh b/CodeGen/benchmark/performance/benchmark.sh
new file mode 100644
index 0000000000..e1ab2dae86
--- /dev/null
+++ b/CodeGen/benchmark/performance/benchmark.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+deployment_type="k8s"
+node_number=1
+service_port=7778
+query_per_node=128
+
+benchmark_tool_path="$(pwd)/GenAIEval"
+
+usage() {
+  echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]"
+  echo "  -d deployment_type  deployment type, select between k8s and docker (default: ${deployment_type})"
+  echo "  -n node_number      Test node number, required only for k8s deployment_type, (default: ${node_number})"
+  echo "  -i service_ip       service ip, required only for docker deployment_type"
+  echo "  -p service_port     service port, required only for docker deployment_type, (default: ${service_port})"
+  exit 1
+}
+
+while getopts ":d:n:i:p:" opt; do
+  case ${opt} in
+    d )
+      deployment_type=$OPTARG
+      ;;
+    n )
+      node_number=$OPTARG
+      ;;
+    i )
+      service_ip=$OPTARG
+      ;;
+    p )
+      service_port=$OPTARG
+      ;;
+    \? )
+      echo "Invalid option: -$OPTARG" 1>&2
+      usage
+      ;;
+    : )
+      echo "Invalid option: -$OPTARG requires an argument" 1>&2
+      usage
+      ;;
+  esac
+done
+
+if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then
+  echo "Error: service_ip is required for docker deployment_type" 1>&2
+  usage
+fi
+
+if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then
+  echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2
+fi
+
+function main() {
+  if [[ ! -d ${benchmark_tool_path} ]]; then
+    echo "Benchmark tool not found, setting up..."
+    setup_env
+  fi
+  run_benchmark
+}
+
+function setup_env() {
+  git clone https://github.com/opea-project/GenAIEval.git
+  pushd ${benchmark_tool_path}
+  python3 -m venv stress_venv
+  source stress_venv/bin/activate
+  pip install -r requirements.txt
+  popd
+}
+
+function run_benchmark() {
+  source ${benchmark_tool_path}/stress_venv/bin/activate
+  export DEPLOYMENT_TYPE=${deployment_type}
+  export SERVICE_IP=${service_ip:-"None"}
+  export SERVICE_PORT=${service_port:-"None"}
+  if [[ -z $USER_QUERIES ]]; then
+    user_query=$((query_per_node*node_number))
+    export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]"
+    echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}."
+  fi
+  export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//')
+  if [[ -z $WARMUP ]]; then export WARMUP=0; fi
+  if [[ -z $TEST_OUTPUT_DIR ]]; then
+    if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then
+      export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}"
+    else
+      export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker"
+    fi
+    echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}."
+  fi
+
+  envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml
+  cd ${benchmark_tool_path}/evals/benchmark
+  python benchmark.py
+}
+
+main
diff --git a/CodeGen/benchmark/performance/benchmark.yaml b/CodeGen/benchmark/performance/benchmark.yaml
new file mode 100644
index 0000000000..90d74d02bf
--- /dev/null
+++ b/CodeGen/benchmark/performance/benchmark.yaml
@@ -0,0 +1,47 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+test_suite_config:  # Overall configuration settings for the test suite
+  examples: ["codegen"]  # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna
+  deployment_type: "k8s"  # Default is "k8s", can also be "docker"
+  service_ip: None  # Leave as None for k8s, specify for Docker
+  service_port: None  # Leave as None for k8s, specify for Docker
+  warm_ups: 0  # Number of test requests for warm-up
+  run_time: 60m  # The max total run time for the test suite
+  seed:  # The seed for all RNGs
+  user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]  # Number of test requests at each concurrency level
+  query_timeout: 120  # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
+ random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "Qwen/CodeQwen1.5-7B-Chat" # The LLM model used for the test + test_output_dir: "/tmp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant(locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape.name is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is target number of requests per user. If not, it is the number of simulated users + # arrival_rate: 1.0 # Request arrival rate. If set, concurrent_level will be overridden, constant load will be generated based on arrival-rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "" # Fill the user-defined namespace. Otherwise, it will be default. + +test_cases: + codegen: + llm: + run_test: true + service_name: "llm-dependency-svc" # Replace with your service name + parameters: + model_name: "Qwen/CodeQwen1.5-7B-Chat" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: true + service_name: "llm-svc" # Replace with your service name + e2e: + run_test: true + service_name: "codegen-backend-svc" # Replace with your service name diff --git a/CodeGen/codegen.yaml b/CodeGen/codegen.yaml index 95f2d78e6a..8dc864f6f6 100644 --- a/CodeGen/codegen.yaml +++ b/CodeGen/codegen.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 volumes: - "./data:/data" runtime: habana diff --git a/CodeGen/docker_compose/intel/cpu/xeon/README.md b/CodeGen/docker_compose/intel/cpu/xeon/README.md index 8bdde1f755..5332d719a3 100644 --- a/CodeGen/docker_compose/intel/cpu/xeon/README.md +++ b/CodeGen/docker_compose/intel/cpu/xeon/README.md @@ -105,7 +105,7 @@ export your_no_proxy=${your_no_proxy},"External_Public_IP" export no_proxy=${your_no_proxy} export http_proxy=${your_http_proxy} export https_proxy=${your_http_proxy} -export LLM_MODEL_ID="meta-llama/CodeLlama-7b-hf" +export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct" export TGI_LLM_ENDPOINT="http://${host_ip}:8028" export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} export MEGA_SERVICE_HOST_IP=${host_ip} diff --git a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml index ab1e4150ce..64b74db71f 100644 --- a/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml +++ b/CodeGen/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "8028:80" diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/README.md b/CodeGen/docker_compose/intel/hpu/gaudi/README.md index 2a5040ea03..31cfad2929 100644 --- a/CodeGen/docker_compose/intel/hpu/gaudi/README.md +++ b/CodeGen/docker_compose/intel/hpu/gaudi/README.md @@ -85,7 +85,7 @@ Since the 
`compose.yaml` will consume some environment variables, you need to se
 export no_proxy=${your_no_proxy}
 export http_proxy=${your_http_proxy}
 export https_proxy=${your_http_proxy}
-export LLM_MODEL_ID="meta-llama/CodeLlama-7b-hf"
+export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
 export TGI_LLM_ENDPOINT="http://${host_ip}:8028"
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
 export MEGA_SERVICE_HOST_IP=${host_ip}
diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
index 153b9f59a9..92b70b099c 100644
--- a/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/CodeGen/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -3,7 +3,7 @@
 
 services:
   tgi-service:
-    image: ghcr.io/huggingface/tgi-gaudi:2.0.5
+    image: ghcr.io/huggingface/tgi-gaudi:2.0.6
     container_name: tgi-gaudi-server
     ports:
       - "8028:80"
diff --git a/CodeGen/docker_compose/set_env.sh b/CodeGen/docker_compose/set_env.sh
index d66a120af2..dba717b64a 100644
--- a/CodeGen/docker_compose/set_env.sh
+++ b/CodeGen/docker_compose/set_env.sh
@@ -4,7 +4,7 @@
 
 # SPDX-License-Identifier: Apache-2.0
 
-export LLM_MODEL_ID="meta-llama/CodeLlama-7b-hf"
+export LLM_MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
 export TGI_LLM_ENDPOINT="http://${host_ip}:8028"
 export MEGA_SERVICE_HOST_IP=${host_ip}
 export LLM_SERVICE_HOST_IP=${host_ip}
diff --git a/CodeGen/kubernetes/intel/README.md b/CodeGen/kubernetes/intel/README.md
index be18003b83..0c47956a8c 100644
--- a/CodeGen/kubernetes/intel/README.md
+++ b/CodeGen/kubernetes/intel/README.md
@@ -12,9 +12,9 @@
 
 ## Deploy On Xeon
 
 ```
-cd GenAIExamples/CodeGen/kubernetes/intel/cpu/xeon/manifests
+cd GenAIExamples/CodeGen/kubernetes/intel/cpu/xeon/manifest
 export HUGGINGFACEHUB_API_TOKEN="YourOwnToken"
-export MODEL_ID="meta-llama/CodeLlama-7b-hf"
+export MODEL_ID="Qwen/Qwen2.5-Coder-7B-Instruct"
 sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" codegen.yaml
-sed -i "s/meta-llama\/CodeLlama-7b-hf/${MODEL_ID}/g" codegen.yaml
+sed -i "s|Qwen/Qwen2.5-Coder-7B-Instruct|${MODEL_ID}|g" codegen.yaml
 kubectl apply -f codegen.yaml
@@ -23,7 +23,7 @@ kubectl apply -f codegen.yaml
 
 ## Deploy On Gaudi
 
 ```
-cd GenAIExamples/CodeGen/kubernetes/intel/hpu/gaudi/manifests
+cd GenAIExamples/CodeGen/kubernetes/intel/hpu/gaudi/manifest
 export HUGGINGFACEHUB_API_TOKEN="YourOwnToken"
 sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" codegen.yaml
 kubectl apply -f codegen.yaml
diff --git a/CodeGen/kubernetes/intel/cpu/xeon/gmc/codegen_xeon.yaml b/CodeGen/kubernetes/intel/cpu/xeon/gmc/codegen_xeon.yaml
index dd1675ce3c..8dd3c2b574 100644
--- a/CodeGen/kubernetes/intel/cpu/xeon/gmc/codegen_xeon.yaml
+++ b/CodeGen/kubernetes/intel/cpu/xeon/gmc/codegen_xeon.yaml
@@ -29,6 +29,6 @@ spec:
       internalService:
         serviceName: tgi-service
         config:
-          MODEL_ID: meta-llama/CodeLlama-7b-hf
+          MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct
           endpoint: /generate
         isDownstreamService: true
diff --git a/CodeGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md b/CodeGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md
index c9d2295bef..c9911ee7de 100644
--- a/CodeGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md
+++ b/CodeGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md
@@ -17,7 +17,7 @@ Before deploying the react-codegen.yaml file, ensure that you have the following
 ```
 # You may set the HUGGINGFACEHUB_API_TOKEN via method: export HUGGINGFACEHUB_API_TOKEN="YourOwnToken"
-  cd GenAIExamples/CodeGen/kubernetes/intel/cpu/xeon/manifests/ui/
+  cd GenAIExamples/CodeGen/kubernetes/intel/cpu/xeon/manifest/ui/
sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" react-codegen.yaml ``` b. Set the proxies based on your network configuration diff --git a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml index 96cc682660..d0070dc969 100644 --- a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml +++ b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen.yaml @@ -64,7 +64,7 @@ metadata: app.kubernetes.io/version: "2.1.0" app.kubernetes.io/managed-by: Helm data: - MODEL_ID: "meta-llama/CodeLlama-7b-hf" + MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct" PORT: "2080" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" @@ -404,7 +404,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml index 5d77fb8cc8..a155af13a0 100644 --- a/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml +++ b/CodeGen/kubernetes/intel/cpu/xeon/manifest/codegen_react_ui.yaml @@ -126,7 +126,7 @@ spec: - name: no_proxy value: securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeGen/kubernetes/intel/hpu/gaudi/gmc/codegen_gaudi.yaml b/CodeGen/kubernetes/intel/hpu/gaudi/gmc/codegen_gaudi.yaml index 2e37820577..d9a927e5c4 100644 --- a/CodeGen/kubernetes/intel/hpu/gaudi/gmc/codegen_gaudi.yaml +++ b/CodeGen/kubernetes/intel/hpu/gaudi/gmc/codegen_gaudi.yaml @@ -29,6 +29,6 @@ spec: internalService: serviceName: tgi-gaudi-svc config: - MODEL_ID: meta-llama/CodeLlama-7b-hf + MODEL_ID: Qwen/Qwen2.5-Coder-7B-Instruct endpoint: /generate isDownstreamService: true diff --git a/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml b/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml index c4a43a7c3c..dc032cd25c 100644 --- a/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml +++ b/CodeGen/kubernetes/intel/hpu/gaudi/manifest/codegen.yaml @@ -64,7 +64,7 @@ metadata: app.kubernetes.io/version: "2.1.0" app.kubernetes.io/managed-by: Helm data: - MODEL_ID: "meta-llama/CodeLlama-7b-hf" + MODEL_ID: "Qwen/Qwen2.5-Coder-7B-Instruct" PORT: "2080" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" @@ -405,7 +405,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeGen/tests/test_compose_on_gaudi.sh b/CodeGen/tests/test_compose_on_gaudi.sh index ec1658314a..f90e0aaa46 100644 --- a/CodeGen/tests/test_compose_on_gaudi.sh +++ b/CodeGen/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codegen codegen-ui llm-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker images && sleep 1s } diff --git a/CodeGen/tests/test_compose_on_xeon.sh b/CodeGen/tests/test_compose_on_xeon.sh index 
0821cd3cb6..b184c00f31 100644 --- a/CodeGen/tests/test_compose_on_xeon.sh +++ b/CodeGen/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codegen codegen-ui llm-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker images && sleep 1s } diff --git a/CodeGen/ui/docker/Dockerfile b/CodeGen/ui/docker/Dockerfile index ac2bb7da31..1d5115f4b5 100644 --- a/CodeGen/ui/docker/Dockerfile +++ b/CodeGen/ui/docker/Dockerfile @@ -23,4 +23,4 @@ RUN npm run build EXPOSE 5173 # Run the front-end application in preview mode -CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] \ No newline at end of file +CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] diff --git a/CodeGen/ui/docker/Dockerfile.react b/CodeGen/ui/docker/Dockerfile.react index 49bc13124c..18afc393ad 100644 --- a/CodeGen/ui/docker/Dockerfile.react +++ b/CodeGen/ui/docker/Dockerfile.react @@ -18,4 +18,4 @@ COPY --from=vite-app /usr/app/react/dist /usr/share/nginx/html COPY ./react/env.sh /docker-entrypoint.d/env.sh COPY ./react/nginx.conf /etc/nginx/conf.d/default.conf -RUN chmod +x /docker-entrypoint.d/env.sh \ No newline at end of file +RUN chmod +x /docker-entrypoint.d/env.sh diff --git a/CodeGen/ui/react/.env b/CodeGen/ui/react/.env index c5a7e3cad8..3ce78a405d 100644 --- a/CodeGen/ui/react/.env +++ b/CodeGen/ui/react/.env @@ -1 +1 @@ -VITE_CODE_GEN_URL=http://ip_address:7778/v1/codegen \ No newline at end of file +VITE_CODE_GEN_URL=http://ip_address:7778/v1/codegen diff --git a/CodeGen/ui/react/.env.production b/CodeGen/ui/react/.env.production index d3851cd494..8e99e67f9f 100644 --- a/CodeGen/ui/react/.env.production +++ b/CodeGen/ui/react/.env.production @@ -1 +1 @@ -VITE_CODE_GEN_URL=APP_CODE_GEN_URL \ No newline at end of file +VITE_CODE_GEN_URL=APP_CODE_GEN_URL diff --git a/CodeGen/ui/react/nginx.conf b/CodeGen/ui/react/nginx.conf index 00433fcda7..01aef12751 100644 --- a/CodeGen/ui/react/nginx.conf +++ b/CodeGen/ui/react/nginx.conf @@ -17,4 +17,4 @@ server { expires 1d; } } -} \ No newline at end of file +} diff --git a/CodeGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx b/CodeGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx index 479034cece..a21f7acc59 100644 --- a/CodeGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx +++ b/CodeGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx @@ -49,4 +49,4 @@ const CodeRender = ({ cleanCode, language, inline }:CodeRenderProps) => { } -export default CodeRender; \ No newline at end of file +export default CodeRender; diff --git a/CodeGen/ui/react/src/components/Shared/Markdown/Markdown.tsx b/CodeGen/ui/react/src/components/Shared/Markdown/Markdown.tsx index 6331c6d08d..2726e14b2e 100644 --- a/CodeGen/ui/react/src/components/Shared/Markdown/Markdown.tsx +++ b/CodeGen/ui/react/src/components/Shared/Markdown/Markdown.tsx @@ -59,4 +59,4 @@ const Markdown = ({ content }: MarkdownProps) => { />) } -export default Markdown; \ No newline at end of file +export default Markdown; diff --git a/CodeGen/ui/svelte/.prettierrc b/CodeGen/ui/svelte/.prettierrc index 3b2006102e..d146ee2b24 100644 --- a/CodeGen/ui/svelte/.prettierrc +++ b/CodeGen/ui/svelte/.prettierrc @@ -10,4 +10,4 @@ } } ] -} \ No newline at end of file +} diff --git 
a/CodeGen/ui/svelte/src/app.postcss b/CodeGen/ui/svelte/src/app.postcss
index fa24380883..ae1c1623f5 100644
--- a/CodeGen/ui/svelte/src/app.postcss
+++ b/CodeGen/ui/svelte/src/app.postcss
@@ -113,4 +113,4 @@ a.btn {
 
 .w-12\/12 {
   width: 100%
-}
\ No newline at end of file
+}
diff --git a/CodeTrans/Dockerfile b/CodeTrans/Dockerfile
index 89bb0b238d..918d936c96 100644
--- a/CodeTrans/Dockerfile
+++ b/CodeTrans/Dockerfile
@@ -18,7 +18,7 @@ WORKDIR /home/user/
 RUN git clone https://github.com/opea-project/GenAIComps.git
 
 WORKDIR /home/user/GenAIComps
-RUN pip install --no-cache-dir --upgrade pip && \
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
     pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt
 
 COPY ./code_translation.py /home/user/code_translation.py
diff --git a/CodeTrans/README.md b/CodeTrans/README.md
index b70666273f..5cfa9b27e0 100644
--- a/CodeTrans/README.md
+++ b/CodeTrans/README.md
@@ -77,9 +77,9 @@ Currently we support two ways of deploying Code Translation services on docker:
 
 By default, the LLM model is set to a default value as listed below:
 
-| Service | Model                         |
-| ------- | ----------------------------- |
-| LLM     | HuggingFaceH4/mistral-7b-grok |
+| Service | Model                              |
+| ------- | ---------------------------------- |
+| LLM     | mistralai/Mistral-7B-Instruct-v0.3 |
 
 Change the `LLM_MODEL_ID` in `docker_compose/set_env.sh` for your needs.
diff --git a/CodeTrans/benchmark/performance/README.md b/CodeTrans/benchmark/performance/README.md
new file mode 100644
index 0000000000..4b519de980
--- /dev/null
+++ b/CodeTrans/benchmark/performance/README.md
@@ -0,0 +1,87 @@
+# CodeTrans Benchmarking
+
+This folder contains a collection of scripts to enable inference benchmarking by leveraging a comprehensive benchmarking tool, [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), that enables throughput analysis to assess inference performance.
+
+By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community.
+
+## Purpose
+
+We aim to run these benchmarks and share them with the OPEA community for three primary reasons:
+
+- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs.
+- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case.
+- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with current leading LLMs, serving frameworks, etc.
+
+## Metrics
+
+The benchmark reports the following metrics:
+
+- Number of Concurrent Requests
+- End-to-End Latency: P50, P90, P99 (in milliseconds)
+- End-to-End First Token Latency: P50, P90, P99 (in milliseconds)
+- Average Next Token Latency (in milliseconds)
+- Average Token Latency (in milliseconds)
+- Requests Per Second (RPS)
+- Output Tokens Per Second
+- Input Tokens Per Second
+
+Results are displayed in the terminal and saved as a CSV file named `1_testspec.yaml`.
+
+## Getting Started
+
+We recommend using Kubernetes to deploy the CodeTrans service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs.
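+
+For reference, here is how the two deployment types map onto invocations of the `benchmark.sh` script described under Test Steps below. This is a minimal sketch based on the script's flags; the docker-mode IP address is a placeholder for your own host:
+
+```bash
+# Kubernetes deployment: only the test node count is needed
+bash benchmark.sh -d k8s -n 2
+
+# Docker deployment: supply your CodeTrans service IP (placeholder below) and port
+bash benchmark.sh -d docker -i 192.168.1.10 -p 7777
+```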
+ +### Prerequisites + +- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md). + +- Every node has direct internet access +- Set up kubectl on the master node with access to the Kubernetes cluster. +- Install Python 3.8+ on the master node for running GenAIEval. +- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods. +- Ensure that the container's ulimit can meet the the number of requests. + +```bash +# The way to modify the containered ulimit: +sudo systemctl edit containerd +# Add two lines: +[Service] +LimitNOFILE=65536:1048576 + +sudo systemctl daemon-reload; sudo systemctl restart containerd +``` + +### Test Steps + +Please deploy CodeTrans service before benchmarking. + +#### Run Benchmark Test + +Before the benchmark, we can configure the number of test queries and test output directory by: + +```bash +export USER_QUERIES="[1, 1, 1, 1]" +export TEST_OUTPUT_DIR="/tmp/benchmark_output" +``` + +And then run the benchmark by: + +```bash +bash benchmark.sh -n +``` + +The argument `-n` refers to the number of test nodes. + +#### Data collection + +All the test results will come to this folder `/tmp/benchmark_output` configured by the environment variable `TEST_OUTPUT_DIR` in previous steps. diff --git a/CodeTrans/benchmark/performance/benchmark.sh b/CodeTrans/benchmark/performance/benchmark.sh new file mode 100644 index 0000000000..6eac50baf8 --- /dev/null +++ b/CodeTrans/benchmark/performance/benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +deployment_type="k8s" +node_number=1 +service_port=7777 +query_per_node=128 + +benchmark_tool_path="$(pwd)/GenAIEval" + +usage() { + echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]" + echo " -d deployment_type deployment type, select between k8s and docker (default: ${deployment_type})" + echo " -n node_number Test node number, required only for k8s deployment_type, (default: ${node_number})" + echo " -i service_ip service ip, required only for docker deployment_type" + echo " -p service_port service port, required only for docker deployment_type, (default: ${service_port})" + exit 1 +} + +while getopts ":d:n:i:p:" opt; do + case ${opt} in + d ) + deployment_type=$OPTARG + ;; + n ) + node_number=$OPTARG + ;; + i ) + service_ip=$OPTARG + ;; + p ) + service_port=$OPTARG + ;; + \? ) + echo "Invalid option: -$OPTARG" 1>&2 + usage + ;; + : ) + echo "Invalid option: -$OPTARG requires an argument" 1>&2 + usage + ;; + esac +done + +if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then + echo "Error: service_ip is required for docker deployment_type" 1>&2 + usage +fi + +if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then + echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2 +fi + +function main() { + if [[ ! -d ${benchmark_tool_path} ]]; then + echo "Benchmark tool not found, setting up..." 
+ setup_env + fi + run_benchmark +} + +function setup_env() { + git clone https://github.com/opea-project/GenAIEval.git + pushd ${benchmark_tool_path} + python3 -m venv stress_venv + source stress_venv/bin/activate + pip install -r requirements.txt + popd +} + +function run_benchmark() { + source ${benchmark_tool_path}/stress_venv/bin/activate + export DEPLOYMENT_TYPE=${deployment_type} + export SERVICE_IP=${service_ip:-"None"} + export SERVICE_PORT=${service_port:-"None"} + if [[ -z $USER_QUERIES ]]; then + user_query=$((query_per_node*node_number)) + export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]" + echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}." + fi + export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//') + if [[ -z $WARMUP ]]; then export WARMUP=0; fi + if [[ -z $TEST_OUTPUT_DIR ]]; then + if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}" + else + export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker" + fi + echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}." + fi + + envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml + cd ${benchmark_tool_path}/evals/benchmark + python benchmark.py +} + +main diff --git a/CodeTrans/benchmark/performance/benchmark.yaml b/CodeTrans/benchmark/performance/benchmark.yaml new file mode 100644 index 0000000000..8680e886de --- /dev/null +++ b/CodeTrans/benchmark/performance/benchmark.yaml @@ -0,0 +1,47 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +test_suite_config: # Overall configuration settings for the test suite + examples: ["codetrans"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna + deployment_type: "k8s" # Default is "k8s", can also be "docker" + service_ip: None # Leave as None for k8s, specify for Docker + service_port: None # Leave as None for k8s, specify for Docker + warm_ups: 0 # Number of test requests for warm-up + run_time: 60m # The max total run time for the test suite + seed: # The seed for all RNGs + user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level + query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default. + random_prompt: false # Use random prompts if true, fixed prompts if false + collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false + data_visualization: false # Generate data visualization if true, do not generate data visualization if false + llm_model: "mistralai/Mistral-7B-Instruct-v0.3" # The LLM model used for the test + test_output_dir: "/home/sdp/benchmark_output" # The directory to store the test output + load_shape: # Tenant concurrency pattern + name: constant # poisson or constant (locust default load shape) + params: # Loadshape-specific parameters + constant: # Constant load shape specific parameters, activate only if load_shape.name is constant + concurrent_level: 4 # If user_queries is specified, concurrent_level is the target number of requests per user. If not, it is the number of simulated users + # arrival_rate: 1.0 # Request arrival rate.
If set, concurrent_level will be overridden and constant load will be generated based on arrival_rate + poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson + arrival_rate: 1.0 # Request arrival rate + namespace: "" # Fill in a user-defined namespace; otherwise, the default namespace is used. + +test_cases: + codetrans: + llm: + run_test: true + service_name: "llm-svc" # Replace with your service name + parameters: + model_name: "mistralai/Mistral-7B-Instruct-v0.3" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: true + service_name: "codetrans-llm-svc" # Replace with your service name + e2e: + run_test: true + service_name: "codetrans-backend-server-svc" # Replace with your service name diff --git a/CodeTrans/codetrans.yaml b/CodeTrans/codetrans.yaml index 9d7f70b4ef..c362599788 100644 --- a/CodeTrans/codetrans.yaml +++ b/CodeTrans/codetrans.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 volumes: - "./data:/data" runtime: habana diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/README.md b/CodeTrans/docker_compose/intel/cpu/xeon/README.md index fd29ce2103..15f6414f04 100755 --- a/CodeTrans/docker_compose/intel/cpu/xeon/README.md +++ b/CodeTrans/docker_compose/intel/cpu/xeon/README.md @@ -57,9 +57,9 @@ Then run the command `docker images`, you will have the following Docker Images: By default, the LLM model is set to a default value as listed below: -| Service | Model | -| ------- | ----------------------------- | -| LLM | HuggingFaceH4/mistral-7b-grok | +| Service | Model | +| ------- | ---------------------------------- | +| LLM | mistralai/Mistral-7B-Instruct-v0.3 | Change the `LLM_MODEL_ID` below for your needs. diff --git a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml index 122028b56e..16c05cf363 100644 --- a/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml +++ b/CodeTrans/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: codetrans-tgi-service ports: - "8008:80" diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md index 1eb1812f22..04858bc235 100755 --- a/CodeTrans/docker_compose/intel/hpu/gaudi/README.md +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/README.md @@ -49,9 +49,9 @@ Then run the command `docker images`, you will have the following Docker Images: By default, the LLM model is set to a default value as listed below: -| Service | Model | -| ------- | ----------------------------- | -| LLM | HuggingFaceH4/mistral-7b-grok | +| Service | Model | +| ------- | ---------------------------------- | +| LLM | mistralai/Mistral-7B-Instruct-v0.3 | Change the `LLM_MODEL_ID` below for your needs.
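For example, a minimal override before starting the stack (a sketch only; `mistralai/Mistral-7B-Instruct-v0.3` is the documented default, and any TGI-compatible model id can be substituted):

```bash
# Override the served model for this shell, then (re)create the containers.
export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3"
docker compose up -d
```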
diff --git a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml index 09b82ed3f6..2f87d10c24 100644 --- a/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/CodeTrans/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: codetrans-tgi-service ports: - "8008:80" diff --git a/CodeTrans/docker_compose/set_env.sh b/CodeTrans/docker_compose/set_env.sh index 5eae8f0cda..b4defd88c5 100644 --- a/CodeTrans/docker_compose/set_env.sh +++ b/CodeTrans/docker_compose/set_env.sh @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 -export LLM_MODEL_ID="HuggingFaceH4/mistral-7b-grok" +export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3" export TGI_LLM_ENDPOINT="http://${host_ip}:8008" export MEGA_SERVICE_HOST_IP=${host_ip} export LLM_SERVICE_HOST_IP=${host_ip} diff --git a/CodeTrans/kubernetes/intel/README.md b/CodeTrans/kubernetes/intel/README.md index 9d6e63f8be..2f778a79b9 100644 --- a/CodeTrans/kubernetes/intel/README.md +++ b/CodeTrans/kubernetes/intel/README.md @@ -14,14 +14,14 @@ By default, the LLM model is set to a default value as listed below: |Service |Model | |---------|-------------------------| -|LLM |HuggingFaceH4/mistral-7b-grok| +|LLM |mistralai/Mistral-7B-Instruct-v0.3| Change the `MODEL_ID` in `codetrans.yaml` for your needs. ## Deploy On Xeon ```bash -cd GenAIExamples/CodeTrans/kubernetes/intel/cpu/xeon/manifests +cd GenAIExamples/CodeTrans/kubernetes/intel/cpu/xeon/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" codetrans.yaml kubectl apply -f codetrans.yaml @@ -30,7 +30,7 @@ kubectl apply -f codetrans.yaml ## Deploy On Gaudi ```bash -cd GenAIExamples/CodeTrans/kubernetes/intel/hpu/gaudi/manifests +cd GenAIExamples/CodeTrans/kubernetes/intel/hpu/gaudi/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" codetrans.yaml kubectl apply -f codetrans.yaml diff --git a/CodeTrans/kubernetes/intel/README_gmc.md b/CodeTrans/kubernetes/intel/README_gmc.md index 1b932f4ea2..0f66407d16 100644 --- a/CodeTrans/kubernetes/intel/README_gmc.md +++ b/CodeTrans/kubernetes/intel/README_gmc.md @@ -13,7 +13,7 @@ By default, the LLM model is set to a default value as listed below: |Service |Model | |---------|-------------------------| -|LLM |HuggingFaceH4/mistral-7b-grok| +|LLM |mistralai/Mistral-7B-Instruct-v0.3| Change the `MODEL_ID` in `codetrans_xeon.yaml` for your needs. 
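For instance, a quick in-place edit before applying the CR (the `sed` pattern is illustrative, and the model id shown is the documented default; this assumes the CR is then applied with `kubectl` as elsewhere in this guide):

```bash
# Swap the model id in the GMC custom resource, then apply it.
sed -i 's|MODEL_ID: .*|MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3|' codetrans_xeon.yaml
kubectl apply -f codetrans_xeon.yaml
```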
diff --git a/CodeTrans/kubernetes/intel/cpu/xeon/gmc/codetrans_xeon.yaml b/CodeTrans/kubernetes/intel/cpu/xeon/gmc/codetrans_xeon.yaml index 889a1d21a6..244e7eb54a 100644 --- a/CodeTrans/kubernetes/intel/cpu/xeon/gmc/codetrans_xeon.yaml +++ b/CodeTrans/kubernetes/intel/cpu/xeon/gmc/codetrans_xeon.yaml @@ -29,6 +29,6 @@ spec: internalService: serviceName: tgi-service config: - MODEL_ID: HuggingFaceH4/mistral-7b-grok + MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3 endpoint: /generate isDownstreamService: true diff --git a/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml b/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml index a68768e2f8..a778a8529e 100644 --- a/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml +++ b/CodeTrans/kubernetes/intel/cpu/xeon/manifest/codetrans.yaml @@ -64,7 +64,7 @@ metadata: app.kubernetes.io/version: "2.1.0" app.kubernetes.io/managed-by: Helm data: - MODEL_ID: "HuggingFaceH4/mistral-7b-grok" + MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" PORT: "2080" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" @@ -404,7 +404,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeTrans/kubernetes/intel/hpu/gaudi/gmc/codetrans_gaudi.yaml b/CodeTrans/kubernetes/intel/hpu/gaudi/gmc/codetrans_gaudi.yaml index 5bc1bd5e2d..b61ffef3ec 100644 --- a/CodeTrans/kubernetes/intel/hpu/gaudi/gmc/codetrans_gaudi.yaml +++ b/CodeTrans/kubernetes/intel/hpu/gaudi/gmc/codetrans_gaudi.yaml @@ -29,6 +29,6 @@ spec: internalService: serviceName: tgi-gaudi-svc config: - MODEL_ID: HuggingFaceH4/mistral-7b-grok + MODEL_ID: mistralai/Mistral-7B-Instruct-v0.3 endpoint: /generate isDownstreamService: true diff --git a/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml b/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml index 541f311799..a2efecf44b 100644 --- a/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml +++ b/CodeTrans/kubernetes/intel/hpu/gaudi/manifest/codetrans.yaml @@ -64,7 +64,7 @@ metadata: app.kubernetes.io/version: "2.1.0" app.kubernetes.io/managed-by: Helm data: - MODEL_ID: "HuggingFaceH4/mistral-7b-grok" + MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3" PORT: "2080" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" @@ -405,7 +405,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/CodeTrans/tests/test_compose_on_gaudi.sh b/CodeTrans/tests/test_compose_on_gaudi.sh index b246f4dc91..c6e8b8c9bd 100644 --- a/CodeTrans/tests/test_compose_on_gaudi.sh +++ b/CodeTrans/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codetrans codetrans-ui llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker images && sleep 1s } @@ -31,7 +31,7 @@ function start_services() { export http_proxy=${http_proxy} export https_proxy=${http_proxy} - export LLM_MODEL_ID="HuggingFaceH4/mistral-7b-grok" + export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3" export TGI_LLM_ENDPOINT="http://${ip_address}:8008" 
export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} diff --git a/CodeTrans/tests/test_compose_on_xeon.sh b/CodeTrans/tests/test_compose_on_xeon.sh index 8cbcb23208..63fe74f058 100644 --- a/CodeTrans/tests/test_compose_on_xeon.sh +++ b/CodeTrans/tests/test_compose_on_xeon.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="codetrans codetrans-ui llm-tgi nginx" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + docker pull ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu docker images && sleep 1s } @@ -30,7 +30,7 @@ function start_services() { cd $WORKPATH/docker_compose/intel/cpu/xeon/ export http_proxy=${http_proxy} export https_proxy=${http_proxy} - export LLM_MODEL_ID="HuggingFaceH4/mistral-7b-grok" + export LLM_MODEL_ID="mistralai/Mistral-7B-Instruct-v0.3" export TGI_LLM_ENDPOINT="http://${ip_address}:8008" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} diff --git a/CodeTrans/ui/docker/Dockerfile b/CodeTrans/ui/docker/Dockerfile index ac2bb7da31..1d5115f4b5 100644 --- a/CodeTrans/ui/docker/Dockerfile +++ b/CodeTrans/ui/docker/Dockerfile @@ -23,4 +23,4 @@ RUN npm run build EXPOSE 5173 # Run the front-end application in preview mode -CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] \ No newline at end of file +CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] diff --git a/DBQnA/README.md b/DBQnA/README.md index 1164d50fbf..063475c181 100644 --- a/DBQnA/README.md +++ b/DBQnA/README.md @@ -4,6 +4,48 @@ Experience a revolutionary way to interact with your database using our DBQnA ap --- +```mermaid +flowchart LR + %% Colors %% + classDef blue fill:#ADD8E6,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.7 + classDef orange fill:#FBAA60,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.7 + classDef orchid fill:#DA70D6,stroke:#1E90FF,stroke-width:2px,fill-opacity:0.7 + classDef invisible fill:transparent,stroke:transparent; + style Text2SQL-MegaService stroke:#000000 + + %% Subgraphs %% + subgraph Text2SQL-MegaService["Text-to-SQL MegaService "] + direction LR + LLM([LLM MicroService]):::invisible + end + subgraph UserInterface[" User Interface "] + direction LR + a([User Input Query]):::orchid + UI([UI server
]):::orchid + end + + LLM_gen{{LLM Service
}} + POSTGRES_DB{{POSTGRES DATABASE
}} + GW([Text-to-SQL Gateway
]):::orange + + + %% Questions interaction + direction LR + a[User Input Query] --> UI + UI --> GW + GW <==> Text2SQL-MegaService + + + %% Text-to-SQL service flow + direction TB + LLM <-.-> POSTGRES_DB + direction LR + LLM <-.-> LLM_gen + +``` + +--- + ## 🛠️ Key Features ### 💬 SQL Query Generation diff --git a/DBQnA/ui/react/nginx.conf b/DBQnA/ui/react/nginx.conf index 00433fcda7..01aef12751 100644 --- a/DBQnA/ui/react/nginx.conf +++ b/DBQnA/ui/react/nginx.conf @@ -17,4 +17,4 @@ server { expires 1d; } } -} \ No newline at end of file +} diff --git a/DBQnA/ui/react/src/logo.svg b/DBQnA/ui/react/src/logo.svg index 9dfc1c058c..7169476033 100644 --- a/DBQnA/ui/react/src/logo.svg +++ b/DBQnA/ui/react/src/logo.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/DBQnA/ui/react/src/main.tsx b/DBQnA/ui/react/src/main.tsx index a6695d54e1..7f3e441540 100644 --- a/DBQnA/ui/react/src/main.tsx +++ b/DBQnA/ui/react/src/main.tsx @@ -10,4 +10,4 @@ ReactDOM.createRoot(document.getElementById("root")!).render( -) \ No newline at end of file +) diff --git a/DocIndexRetriever/Dockerfile b/DocIndexRetriever/Dockerfile index 4e738a22a9..c8794f3efc 100644 --- a/DocIndexRetriever/Dockerfile +++ b/DocIndexRetriever/Dockerfile @@ -16,7 +16,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt COPY ./retrieval_tool.py /home/user/retrieval_tool.py diff --git a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml index 1d0a445050..fc8accadcf 100644 --- a/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/DocIndexRetriever/docker_compose/intel/hpu/gaudi/compose.yaml @@ -28,7 +28,7 @@ services: TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:1.5.0 container_name: tei-embedding-gaudi-server ports: - "8090:80" diff --git a/DocIndexRetriever/tests/test_compose_on_gaudi.sh b/DocIndexRetriever/tests/test_compose_on_gaudi.sh index 8779944be4..e652ead26b 100644 --- a/DocIndexRetriever/tests/test_compose_on_gaudi.sh +++ b/DocIndexRetriever/tests/test_compose_on_gaudi.sh @@ -24,7 +24,7 @@ function build_docker_images() { docker compose -f build.yaml build --no-cache > ${LOG_PATH}/docker_image_build.log docker pull redis/redis-stack:7.2.0-v9 - docker pull ghcr.io/huggingface/tei-gaudi:latest + docker pull ghcr.io/huggingface/tei-gaudi:1.5.0 docker images && sleep 1s } diff --git a/DocSum/Dockerfile b/DocSum/Dockerfile index 5ffd463217..d0dac691c8 100644 --- a/DocSum/Dockerfile +++ b/DocSum/Dockerfile @@ -18,7 +18,7 @@ WORKDIR /home/user RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt COPY ./docsum.py /home/user/docsum.py diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml index 620ee36575..35e673563b 100644 --- a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml +++ b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - 
image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-service ports: - "8008:80" diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md index 3480750db7..6882f0ebae 100644 --- a/DocSum/docker_compose/intel/hpu/gaudi/README.md +++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md @@ -11,7 +11,7 @@ First of all, you need to build Docker Images locally. This step can be ignored As TGI Gaudi has been officially published as a Docker image, we simply need to pull it: ```bash -docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 +docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 ``` ### 2. Build LLM Image @@ -28,7 +28,7 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op ```bash git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/DocSum/docker +cd GenAIExamples/DocSum docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` @@ -53,7 +53,7 @@ docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT Then run the command `docker images`, you will have the following Docker Images: -1. `ghcr.io/huggingface/tgi-gaudi:2.0.5` +1. `ghcr.io/huggingface/tgi-gaudi:2.0.6` 2. `opea/llm-docsum-tgi:latest` 3. `opea/docsum:latest` 4. `opea/docsum-ui:latest` diff --git a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml index e9f3a96f85..71c52b40ae 100644 --- a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,11 +3,12 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: tgi-gaudi-server ports: - "8008:80" environment: + HABANA_VISIBLE_DEVICES: all no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} diff --git a/DocSum/docsum.yaml b/DocSum/docsum.yaml index bc87bc5b46..9e9936ff49 100644 --- a/DocSum/docsum.yaml +++ b/DocSum/docsum.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 volumes: - "./data:/data" runtime: habana diff --git a/DocSum/kubernetes/intel/README.md b/DocSum/kubernetes/intel/README.md index dc81ee35ee..b3c797020e 100644 --- a/DocSum/kubernetes/intel/README.md +++ b/DocSum/kubernetes/intel/README.md @@ -11,7 +11,7 @@ ## Deploy On Xeon ``` -cd GenAIExamples/DocSum/kubernetes/intel/cpu/xeon/manifests +cd GenAIExamples/DocSum/kubernetes/intel/cpu/xeon/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" docsum.yaml kubectl apply -f docsum.yaml @@ -20,7 +20,7 @@ kubectl apply -f docsum.yaml ## Deploy On Gaudi ``` -cd GenAIExamples/DocSum/kubernetes/intel/hpu/gaudi/manifests +cd GenAIExamples/DocSum/kubernetes/intel/hpu/gaudi/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" docsum.yaml kubectl apply -f docsum.yaml diff --git a/DocSum/kubernetes/intel/README_gmc.md b/DocSum/kubernetes/intel/README_gmc.md index b332292110..00e9d8e1bc 100644 --- a/DocSum/kubernetes/intel/README_gmc.md +++ b/DocSum/kubernetes/intel/README_gmc.md @@ -8,8 +8,8 @@ Install GMC in your Kubernetes cluster, 
if you have not already done so, by foll The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks if the microservices listed in the CR yaml file are running, if not it starts them and then proceeds to connect them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. Should you use "kubectl get pods" commands you will see all the component microservices, in particular embedding, retriever, rerank, and llm. The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image `llm-docsum-tgi:latest` which internally leverages the -the image `ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the -service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.0.5`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use `Intel/neural-chat-7b-v3-3`. +image `ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu`. The service is called tgi-svc. Meanwhile, the Gaudi version launches the +service tgi-gaudi-svc, which uses the image `ghcr.io/huggingface/tgi-gaudi:2.0.6`. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the example below we use `Intel/neural-chat-7b-v3-3`. [NOTE] Refer to [Docker Xeon README](https://github.com/opea-project/GenAIExamples/blob/main/DocSum/docker_compose/intel/cpu/xeon/README.md) or diff --git a/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml b/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml index 1416bdbcbc..9199888a10 100644 --- a/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml +++ b/DocSum/kubernetes/intel/cpu/xeon/manifest/docsum.yaml @@ -404,7 +404,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/README.md b/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/README.md index de7419bc90..7a4f74e848 100644 --- a/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/README.md +++ b/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/README.md @@ -16,7 +16,7 @@ Before deploying the react-docsum.yaml file, ensure that you have the following ``` # You may set the HUGGINGFACEHUB_API_TOKEN via method: export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" - cd GenAIExamples/DocSum/kubernetes/intel/cpu/xeon/manifests/ui/ + cd GenAIExamples/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/ sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" react-docsum.yaml ``` b.
Set the proxies based on your network configuration diff --git a/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml b/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml index 61e8799b0e..560e34a215 100644 --- a/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml +++ b/DocSum/kubernetes/intel/cpu/xeon/manifest/ui/react-docsum.yaml @@ -126,7 +126,7 @@ spec: - name: no_proxy value: securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml b/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml index 5c10f3c761..7ab1df9b1e 100644 --- a/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml +++ b/DocSum/kubernetes/intel/hpu/gaudi/manifest/docsum.yaml @@ -405,7 +405,7 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "ghcr.io/huggingface/tgi-gaudi:2.0.5" + image: "ghcr.io/huggingface/tgi-gaudi:2.0.6" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh index 9c9ca92a03..12a6a8861b 100644 --- a/DocSum/tests/test_compose_on_gaudi.sh +++ b/DocSum/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="docsum docsum-ui llm-docsum-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker images && sleep 1s } @@ -169,7 +169,7 @@ function main() { validate_microservices validate_megaservice - #validate_frontend + validate_frontend stop_docker echo y | docker system prune diff --git a/DocSum/tests/test_compose_on_xeon.sh b/DocSum/tests/test_compose_on_xeon.sh index fbd6797f58..7f0b2f8c53 100644 --- a/DocSum/tests/test_compose_on_xeon.sh +++ b/DocSum/tests/test_compose_on_xeon.sh @@ -168,7 +168,7 @@ function main() { validate_microservices validate_megaservice - #validate_frontend + validate_frontend stop_docker echo y | docker system prune diff --git a/DocSum/ui/docker/Dockerfile b/DocSum/ui/docker/Dockerfile index ac2bb7da31..1d5115f4b5 100644 --- a/DocSum/ui/docker/Dockerfile +++ b/DocSum/ui/docker/Dockerfile @@ -23,4 +23,4 @@ RUN npm run build EXPOSE 5173 # Run the front-end application in preview mode -CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] \ No newline at end of file +CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] diff --git a/DocSum/ui/docker/Dockerfile.react b/DocSum/ui/docker/Dockerfile.react index aa8f3fe78e..9458864658 100644 --- a/DocSum/ui/docker/Dockerfile.react +++ b/DocSum/ui/docker/Dockerfile.react @@ -21,4 +21,4 @@ EXPOSE 80 COPY --from=vite-app /usr/app/react/nginx.conf /etc/nginx/conf.d/default.conf COPY --from=vite-app /usr/app/react/dist /usr/share/nginx/html -ENTRYPOINT ["nginx", "-g", "daemon off;"] \ No newline at end of file +ENTRYPOINT ["nginx", "-g", "daemon off;"] diff --git a/DocSum/ui/react/.env b/DocSum/ui/react/.env index 88e4996a29..b25495154a 100644 --- a/DocSum/ui/react/.env +++ b/DocSum/ui/react/.env @@ -1 +1 @@ -VITE_DOC_SUM_URL=http://backend_address:8888/v1/docsum \ No newline at end of file +VITE_DOC_SUM_URL=http://backend_address:8888/v1/docsum diff --git a/DocSum/ui/react/nginx.conf b/DocSum/ui/react/nginx.conf 
index 00433fcda7..01aef12751 100644 --- a/DocSum/ui/react/nginx.conf +++ b/DocSum/ui/react/nginx.conf @@ -17,4 +17,4 @@ server { expires 1d; } } -} \ No newline at end of file +} diff --git a/DocSum/ui/react/src/components/DocSum/DocSum.tsx b/DocSum/ui/react/src/components/DocSum/DocSum.tsx index 9e7472c658..2fa9fd4a34 100644 --- a/DocSum/ui/react/src/components/DocSum/DocSum.tsx +++ b/DocSum/ui/react/src/components/DocSum/DocSum.tsx @@ -150,4 +150,4 @@ const DocSum = () => { ) } -export default DocSum \ No newline at end of file +export default DocSum diff --git a/DocSum/ui/react/src/components/DocSum/FileUpload.tsx b/DocSum/ui/react/src/components/DocSum/FileUpload.tsx index baa77670ca..1790cfb161 100644 --- a/DocSum/ui/react/src/components/DocSum/FileUpload.tsx +++ b/DocSum/ui/react/src/components/DocSum/FileUpload.tsx @@ -64,4 +64,4 @@ export function FileUpload(props: Partial) { ); -} \ No newline at end of file +} diff --git a/DocSum/ui/react/src/components/Shared/CodeRender/CodeRender.tsx b/DocSum/ui/react/src/components/Shared/CodeRender/CodeRender.tsx index 479034cece..a21f7acc59 100644 --- a/DocSum/ui/react/src/components/Shared/CodeRender/CodeRender.tsx +++ b/DocSum/ui/react/src/components/Shared/CodeRender/CodeRender.tsx @@ -49,4 +49,4 @@ const CodeRender = ({ cleanCode, language, inline }:CodeRenderProps) => { } -export default CodeRender; \ No newline at end of file +export default CodeRender; diff --git a/DocSum/ui/react/src/components/Shared/Markdown/Markdown.tsx b/DocSum/ui/react/src/components/Shared/Markdown/Markdown.tsx index dc4b2d3083..77471cd190 100644 --- a/DocSum/ui/react/src/components/Shared/Markdown/Markdown.tsx +++ b/DocSum/ui/react/src/components/Shared/Markdown/Markdown.tsx @@ -55,4 +55,4 @@ const Markdown = ({ content }: MarkdownProps) => { />) } -export default Markdown; \ No newline at end of file +export default Markdown; diff --git a/DocSum/ui/svelte/src/lib/shared/Network.ts b/DocSum/ui/svelte/src/lib/shared/Network.ts index 172b25a441..705019c897 100644 --- a/DocSum/ui/svelte/src/lib/shared/Network.ts +++ b/DocSum/ui/svelte/src/lib/shared/Network.ts @@ -20,7 +20,12 @@ export async function fetchTextStream(query: string | Blob, params: string, file const url = `${DOC_BASE_URL}`; // Ensure the URL is constructed correctly const formData = new FormData(); - if (params === "doc_id" && file) { + if (!file) { + file = new Blob([""], { type: "text/plain" }); + fileName = "empty.txt"; + } + + if (params === "doc_id") { formData.append("files", file, fileName); formData.append("messages", query); } else if (params === "text") { diff --git a/EdgeCraftRAG/Dockerfile b/EdgeCraftRAG/Dockerfile new file mode 100644 index 0000000000..3c9711deaf --- /dev/null +++ b/EdgeCraftRAG/Dockerfile @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY ./edgecraftrag /home/user/edgecraftrag +COPY ./chatqna.py /home/user/chatqna.py + +WORKDIR /home/user/edgecraftrag +RUN pip install --no-cache-dir -r requirements.txt + +WORKDIR /home/user + +USER user + +RUN echo 'ulimit -S -n 999999' >> ~/.bashrc + +ENTRYPOINT ["python", "chatqna.py"] diff --git a/EdgeCraftRAG/Dockerfile.server b/EdgeCraftRAG/Dockerfile.server new file mode 100644 index 
0000000000..c04dc0a545 --- /dev/null +++ b/EdgeCraftRAG/Dockerfile.server @@ -0,0 +1,35 @@ +FROM python:3.11-slim + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN apt-get update && apt-get install -y gnupg wget +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ + gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg +RUN echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy client" | \ + tee /etc/apt/sources.list.d/intel-gpu-jammy.list +RUN apt-get update +RUN apt-get install -y \ + intel-opencl-icd intel-level-zero-gpu level-zero intel-level-zero-gpu-raytracing \ + intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY ./edgecraftrag /home/user/edgecraftrag + +WORKDIR /home/user/edgecraftrag +RUN pip install --no-cache-dir -r requirements.txt + +WORKDIR /home/user/ + +USER user + +ENTRYPOINT ["python", "-m", "edgecraftrag.server"] diff --git a/EdgeCraftRAG/README.md b/EdgeCraftRAG/README.md new file mode 100644 index 0000000000..da8d2efb07 --- /dev/null +++ b/EdgeCraftRAG/README.md @@ -0,0 +1,274 @@ +# Edge Craft Retrieval-Augmented Generation + +Edge Craft RAG (EC-RAG) is a customizable, tunable, and production-ready +Retrieval-Augmented Generation system for edge solutions. It is designed to +curate the RAG pipeline to meet hardware requirements at the edge with guaranteed +quality and performance. + +## Quick Start Guide + +### Run Containers with Docker Compose + +```bash +cd GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc + +export MODEL_PATH="your model path for all your models" +export DOC_PATH="your doc path for uploading a dir of files" +export HOST_IP="your host ip" +export UI_SERVICE_PORT="port for UI service" + +# Optional for vllm endpoint +export vLLM_ENDPOINT="http://${HOST_IP}:8008" + +# If you have a proxy configured, uncomment the line below +# export no_proxy=$no_proxy,${HOST_IP},edgecraftrag,edgecraftrag-server +# If you have a HF mirror configured, it will be imported into the container +# export HF_ENDPOINT="your HF mirror endpoint" + +# The container ports are set by default; uncomment to change them +# export MEGA_SERVICE_PORT=16011 +# export PIPELINE_SERVICE_PORT=16010 + +docker compose up -d +``` + +### (Optional) Build Docker Images for Mega Service, Server and UI on your own + +```bash +cd GenAIExamples/EdgeCraftRAG + +docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag:latest -f Dockerfile . +docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-server:latest -f Dockerfile.server . +docker build --build-arg http_proxy=$HTTP_PROXY --build-arg https_proxy=$HTTPS_PROXY --build-arg no_proxy=$NO_PROXY -t opea/edgecraftrag-ui:latest -f ui/docker/Dockerfile.ui .
+``` + +### ChatQnA with LLM Example (Command Line) + +```bash +cd GenAIExamples/EdgeCraftRAG + +# Activate pipeline test_pipeline_local_llm +curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @tests/test_pipeline_local_llm.json | jq '.' + +# You may need to wait several minutes +# Expected output: +# { +# "idx": "3214cf25-8dff-46e6-b7d1-1811f237cf8c", +# "name": "rag_test", +# "comp_type": "pipeline", +# "node_parser": { +# "idx": "ababed12-c192-4cbb-b27e-e49c76a751ca", +# "parser_type": "simple", +# "chunk_size": 400, +# "chunk_overlap": 48 +# }, +# "indexer": { +# "idx": "46969b63-8a32-4142-874d-d5c86ee9e228", +# "indexer_type": "faiss_vector", +# "model": { +# "idx": "7aae57c0-13a4-4a15-aecb-46c2ec8fe738", +# "type": "embedding", +# "model_id": "BAAI/bge-small-en-v1.5", +# "model_path": "/home/user/models/bge_ov_embedding", +# "device": "auto" +# } +# }, +# "retriever": { +# "idx": "3747fa59-ff9b-49b6-a8e8-03cdf8c979a4", +# "retriever_type": "vectorsimilarity", +# "retrieve_topk": 30 +# }, +# "postprocessor": [ +# { +# "idx": "d46a6cae-ba7a-412e-85b7-d334f175efaa", +# "postprocessor_type": "reranker", +# "model": { +# "idx": "374e7471-bd7d-41d0-b69d-a749a052b4b0", +# "type": "reranker", +# "model_id": "BAAI/bge-reranker-large", +# "model_path": "/home/user/models/bge_ov_reranker", +# "device": "auto" +# }, +# "top_n": 2 +# } +# ], +# "generator": { +# "idx": "52d8f112-6290-4dd3-bc28-f9bd5deeb7c8", +# "generator_type": "local", +# "model": { +# "idx": "fa0c11e1-46d1-4df8-a6d8-48cf6b99eff3", +# "type": "llm", +# "model_id": "qwen2-7b-instruct", +# "model_path": "/home/user/models/qwen2-7b-instruct/INT4_compressed_weights", +# "device": "auto" +# } +# }, +# "status": { +# "active": true +# } +# } + +# Prepare data from local directory +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d '{"local_path":"#REPLACE WITH YOUR LOCAL DOC DIR#"}' | jq '.' + +# Validate Mega Service +curl -X POST http://${HOST_IP}:16011/v1/chatqna -H "Content-Type: application/json" -d '{"messages":"#REPLACE WITH YOUR QUESTION HERE#", "top_n":5, "max_tokens":512}' | jq '.' +``` + +### ChatQnA with LLM Example (UI) + +Open your browser and access http://${HOST_IP}:8082. + +> Your browser should be running on the same host as your console; otherwise, you will need to access the UI with your host's domain name instead of ${HOST_IP}. + +### (Optional) Launch vLLM with OpenVINO service + +```bash +# 1. export LLM_MODEL +export LLM_MODEL="your model id" +# 2. Uncomment the code below in 'GenAIExamples/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml' + # vllm-service: + # image: vllm:openvino + # container_name: vllm-openvino-server + # ports: + # - "8008:80" + # environment: + # no_proxy: ${no_proxy} + # http_proxy: ${http_proxy} + # https_proxy: ${https_proxy} + # vLLM_ENDPOINT: ${vLLM_ENDPOINT} + # LLM_MODEL: ${LLM_MODEL} + # entrypoint: /bin/bash -c "\ + # cd / && \ + # export VLLM_CPU_KVCACHE_SPACE=50 && \ + # python3 -m vllm.entrypoints.openai.api_server \ + # --model '${LLM_MODEL}' \ + # --host 0.0.0.0 \ + # --port 80" +``` + +## Advanced User Guide + +### Pipeline Management + +#### Create a pipeline + +```bash +curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline.json | jq '.' +``` + +It will take some time to prepare the embedding model.
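If you prefer an inline payload over `examples/test_pipeline.json`, the request below is a hypothetical minimal sketch assembled from the fields shown in the expected output above (retrieval-only, with no generator); treat the JSON files under `examples/` as the authoritative reference for your setup.

```bash
# Illustrative only: field names follow the pipeline API schema; values are examples.
curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines \
  -H "Content-Type: application/json" \
  -d '{
        "name": "test1",
        "node_parser": {"parser_type": "simple", "chunk_size": 400, "chunk_overlap": 48},
        "indexer": {
          "indexer_type": "faiss_vector",
          "embedding_model": {
            "model_id": "BAAI/bge-small-en-v1.5",
            "model_path": "/home/user/models/bge_ov_embedding",
            "device": "auto"
          }
        },
        "retriever": {"retriever_type": "vectorsimilarity", "retrieve_topk": 30},
        "active": true
      }' | jq '.'
```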
+ +#### Upload a text + +```bash +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.' +``` + +#### Provide a query to retrieve context with similarity search + +```bash +curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d @examples/test_query.json | jq '.' +``` + +#### Create the second pipeline test2 + +```bash +curl -X POST http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" -d @examples/test_pipeline2.json | jq '.' +``` + +#### Check all pipelines + +```bash +curl -X GET http://${HOST_IP}:16010/v1/settings/pipelines -H "Content-Type: application/json" | jq '.' +``` + +#### Compare similarity retrieval (test1) and keyword retrieval (test2) + +```bash +# Activate pipeline test1 +curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test1 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.' +# Similarity retrieval +curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.' + +# Activate pipeline test2 +curl -X PATCH http://${HOST_IP}:16010/v1/settings/pipelines/test2 -H "Content-Type: application/json" -d '{"active": "true"}' | jq '.' +# Keyword retrieval +curl -X POST http://${HOST_IP}:16010/v1/retrieval -H "Content-Type: application/json" -d '{"messages":"number"}' | jq '.' + +``` + +### Model Management + +#### Load a model + +```bash +curl -X POST http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" -d @examples/test_model_load.json | jq '.' +``` + +It will take some time to load the model. + +#### Check all models + +```bash +curl -X GET http://${HOST_IP}:16010/v1/settings/models -H "Content-Type: application/json" | jq '.' +``` + +#### Update a model + +```bash +curl -X PATCH http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" -d @examples/test_model_update.json | jq '.' +``` + +#### Check a certain model + +```bash +curl -X GET http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" | jq '.' +``` + +#### Delete a model + +```bash +curl -X DELETE http://${HOST_IP}:16010/v1/settings/models/BAAI/bge-reranker-large -H "Content-Type: application/json" | jq '.' +``` + +### File Management + +#### Add a text + +```bash +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data.json | jq '.' +``` + +#### Add files from an existing file path + +```bash +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_dir.json | jq '.' +curl -X POST http://${HOST_IP}:16010/v1/data -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.' +``` + +#### Check all files + +```bash +curl -X GET http://${HOST_IP}:16010/v1/data/files -H "Content-Type: application/json" | jq '.' +``` + +#### Check one file + +```bash +curl -X GET http://${HOST_IP}:16010/v1/data/files/test2.docx -H "Content-Type: application/json" | jq '.' +``` + +#### Delete a file + +```bash +curl -X DELETE http://${HOST_IP}:16010/v1/data/files/test2.docx -H "Content-Type: application/json" | jq '.' +``` + +#### Update a file + +```bash +curl -X PATCH http://${HOST_IP}:16010/v1/data/files/test.pdf -H "Content-Type: application/json" -d @examples/test_data_file.json | jq '.'
+``` diff --git a/EdgeCraftRAG/chatqna.py b/EdgeCraftRAG/chatqna.py new file mode 100644 index 0000000000..1afa9621ce --- /dev/null +++ b/EdgeCraftRAG/chatqna.py @@ -0,0 +1,72 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from comps import MicroService, ServiceOrchestrator, ServiceType + +MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "127.0.0.1") +MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 16011)) +PIPELINE_SERVICE_HOST_IP = os.getenv("PIPELINE_SERVICE_HOST_IP", "127.0.0.1") +PIPELINE_SERVICE_PORT = int(os.getenv("PIPELINE_SERVICE_PORT", 16010)) + +from comps import Gateway, MegaServiceEndpoint +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatMessage, + UsageInfo, +) +from fastapi import Request +from fastapi.responses import StreamingResponse + + +class EdgeCraftRagGateway(Gateway): + def __init__(self, megaservice, host="0.0.0.0", port=16011): + super().__init__( + megaservice, host, port, str(MegaServiceEndpoint.CHAT_QNA), ChatCompletionRequest, ChatCompletionResponse + ) + + async def handle_request(self, request: Request): + input = await request.json() + result_dict, runtime_graph = await self.megaservice.schedule(initial_inputs=input) + for node, response in result_dict.items(): + if isinstance(response, StreamingResponse): + return response + last_node = runtime_graph.all_leaves()[-1] + response = result_dict[last_node] + choices = [] + usage = UsageInfo() + choices.append( + ChatCompletionResponseChoice( + index=0, + message=ChatMessage(role="assistant", content=response), + finish_reason="stop", + ) + ) + return ChatCompletionResponse(model="edgecraftrag", choices=choices, usage=usage) + + +class EdgeCraftRagService: + def __init__(self, host="0.0.0.0", port=16010): + self.host = host + self.port = port + self.megaservice = ServiceOrchestrator() + + def add_remote_service(self): + edgecraftrag = MicroService( + name="pipeline", + host=PIPELINE_SERVICE_HOST_IP, + port=PIPELINE_SERVICE_PORT, + endpoint="/v1/chatqna", + use_remote_service=True, + service_type=ServiceType.UNDEFINED, + ) + self.megaservice.add(edgecraftrag) + self.gateway = EdgeCraftRagGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + + +if __name__ == "__main__": + edgecraftrag = EdgeCraftRagService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) + edgecraftrag.add_remote_service() diff --git a/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml new file mode 100644 index 0000000000..f877b7c582 --- /dev/null +++ b/EdgeCraftRAG/docker_compose/intel/gpu/arc/compose.yaml @@ -0,0 +1,78 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + server: + image: ${REGISTRY:-opea}/edgecraftrag-server:${TAG:-latest} + container_name: edgecraftrag-server + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_ENDPOINT: ${HF_ENDPOINT} + vLLM_ENDPOINT: ${vLLM_ENDPOINT} + volumes: + - ${MODEL_PATH:-${PWD}}:/home/user/models + - ${DOC_PATH:-${PWD}}:/home/user/docs + ports: + - ${PIPELINE_SERVICE_PORT:-16010}:${PIPELINE_SERVICE_PORT:-16010} + devices: + - /dev/dri:/dev/dri + group_add: + - video + ecrag: + image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest} + container_name: edgecraftrag + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MEGA_SERVICE_PORT: 
${MEGA_SERVICE_PORT:-16011} + MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}} + PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010} + PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}} + ports: + - ${MEGA_SERVICE_PORT:-16011}:${MEGA_SERVICE_PORT:-16011} + depends_on: + - server + ui: + image: ${REGISTRY:-opea}/edgecraftrag-ui:${TAG:-latest} + container_name: edgecraftrag-ui + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MEGA_SERVICE_PORT: ${MEGA_SERVICE_PORT:-16011} + MEGA_SERVICE_HOST_IP: ${MEGA_SERVICE_HOST_IP:-${HOST_IP}} + PIPELINE_SERVICE_PORT: ${PIPELINE_SERVICE_PORT:-16010} + PIPELINE_SERVICE_HOST_IP: ${PIPELINE_SERVICE_HOST_IP:-${HOST_IP}} + UI_SERVICE_PORT: ${UI_SERVICE_PORT:-8082} + UI_SERVICE_HOST_IP: ${UI_SERVICE_HOST_IP:-0.0.0.0} + ports: + - ${UI_SERVICE_PORT:-8082}:${UI_SERVICE_PORT:-8082} + restart: always + depends_on: + - server + - ecrag + # vllm-service: + # image: vllm:openvino + # container_name: vllm-openvino-server + # ports: + # - "8008:80" + # environment: + # no_proxy: ${no_proxy} + # http_proxy: ${http_proxy} + # https_proxy: ${https_proxy} + # vLLM_ENDPOINT: ${vLLM_ENDPOINT} + # LLM_MODEL: ${LLM_MODEL} + # entrypoint: /bin/bash -c "\ + # cd / && \ + # export VLLM_CPU_KVCACHE_SPACE=50 && \ + # python3 -m vllm.entrypoints.openai.api_server \ + # --model '${LLM_MODEL}' \ + # --host 0.0.0.0 \ + # --port 80" + +networks: + default: + driver: bridge diff --git a/EdgeCraftRAG/docker_image_build/build.yaml b/EdgeCraftRAG/docker_image_build/build.yaml new file mode 100644 index 0000000000..e0cc355cc6 --- /dev/null +++ b/EdgeCraftRAG/docker_image_build/build.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + server: + build: + context: .. + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + dockerfile: ./Dockerfile.server + image: ${REGISTRY:-opea}/edgecraftrag-server:${TAG:-latest} + ui: + build: + context: .. + args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + dockerfile: ./ui/docker/Dockerfile.ui + image: ${REGISTRY:-opea}/edgecraftrag-ui:${TAG:-latest} + ecrag: + build: + context: .. 
+ args: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + dockerfile: ./Dockerfile + image: ${REGISTRY:-opea}/edgecraftrag:${TAG:-latest} diff --git a/EdgeCraftRAG/edgecraftrag/__init__.py b/EdgeCraftRAG/edgecraftrag/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/EdgeCraftRAG/edgecraftrag/api/__init__.py b/EdgeCraftRAG/edgecraftrag/api/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/api/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/__init__.py b/EdgeCraftRAG/edgecraftrag/api/v1/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/api/v1/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py new file mode 100644 index 0000000000..dfd32c29e6 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/api/v1/chatqna.py @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from comps.cores.proto.api_protocol import ChatCompletionRequest +from edgecraftrag.context import ctx +from fastapi import FastAPI + +chatqna_app = FastAPI() + + +# Retrieval +@chatqna_app.post(path="/v1/retrieval") +async def retrieval(request: ChatCompletionRequest): + nodeswithscore = ctx.get_pipeline_mgr().run_retrieve(chat_request=request) + print(nodeswithscore) + if nodeswithscore is not None: + ret = [] + for n in nodeswithscore: + ret.append((n.node.node_id, n.node.text, n.score)) + return ret + + return "Not found" + + +# ChatQnA +@chatqna_app.post(path="/v1/chatqna") +async def chatqna(request: ChatCompletionRequest): + ret = ctx.get_pipeline_mgr().run_pipeline(chat_request=request) + return str(ret) diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/data.py b/EdgeCraftRAG/edgecraftrag/api/v1/data.py new file mode 100644 index 0000000000..fb5b327929 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/api/v1/data.py @@ -0,0 +1,102 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from edgecraftrag.api_schema import DataIn, FilesIn +from edgecraftrag.context import ctx +from fastapi import FastAPI + +data_app = FastAPI() + + +# Upload a text or files +@data_app.post(path="/v1/data") +async def add_data(request: DataIn): + nodelist = None + + docs = [] + if request.text is not None: + docs.extend(ctx.get_file_mgr().add_text(text=request.text)) + if request.local_path is not None: + docs.extend(ctx.get_file_mgr().add_files(docs=request.local_path)) + + nodelist = ctx.get_pipeline_mgr().run_data_prepare(docs=docs) + if nodelist is None: + return "Error" + pl = ctx.get_pipeline_mgr().get_active_pipeline() + # TODO: Need bug fix, when node_parser is None + ctx.get_node_mgr().add_nodes(pl.node_parser.idx, nodelist) + return "Done" + + +# Upload files by a list of file_path +@data_app.post(path="/v1/data/files") +async def add_files(request: FilesIn): + nodelist = None + + docs = [] + if request.local_paths is not None: + docs.extend(ctx.get_file_mgr().add_files(docs=request.local_paths)) + + nodelist = ctx.get_pipeline_mgr().run_data_prepare(docs=docs) + if nodelist is None: + return "Error" + pl = 
ctx.get_pipeline_mgr().get_active_pipeline() + # TODO: needs a bug fix for the case when node_parser is None + ctx.get_node_mgr().add_nodes(pl.node_parser.idx, nodelist) + return "Done" + + +# GET files +@data_app.get(path="/v1/data/files") +async def get_files(): + return ctx.get_file_mgr().get_files() + + +# GET a file +@data_app.get(path="/v1/data/files/{name}") +async def get_file_docs(name): + return ctx.get_file_mgr().get_docs_by_file(name) + + +# DELETE a file +@data_app.delete(path="/v1/data/files/{name}") +async def delete_file(name): + if ctx.get_file_mgr().del_file(name): + # TODO: delete the nodes related to the file + all_docs = ctx.get_file_mgr().get_all_docs() + + nodelist = ctx.get_pipeline_mgr().run_data_prepare(docs=all_docs) + if nodelist is None: + return "Error" + pl = ctx.get_pipeline_mgr().get_active_pipeline() + ctx.get_node_mgr().del_nodes_by_np_idx(pl.node_parser.idx) + ctx.get_node_mgr().add_nodes(pl.node_parser.idx, nodelist) + return f"File {name} is deleted" + else: + return f"File {name} not found" + + +# UPDATE a file +@data_app.patch(path="/v1/data/files/{name}") +async def update_file(name, request: DataIn): + # 1. Delete + if ctx.get_file_mgr().del_file(name): + # 2. Add + docs = [] + if request.text is not None: + docs.extend(ctx.get_file_mgr().add_text(text=request.text)) + if request.local_path is not None: + docs.extend(ctx.get_file_mgr().add_files(docs=request.local_path)) + + # 3. Re-run the pipeline + # TODO: update the nodes related to the file + all_docs = ctx.get_file_mgr().get_all_docs() + nodelist = ctx.get_pipeline_mgr().run_data_prepare(docs=all_docs) + if nodelist is None: + return "Error" + pl = ctx.get_pipeline_mgr().get_active_pipeline() + ctx.get_node_mgr().del_nodes_by_np_idx(pl.node_parser.idx) + ctx.get_node_mgr().add_nodes(pl.node_parser.idx, nodelist) + return f"File {name} is updated" + else: + return f"File {name} not found" diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/model.py b/EdgeCraftRAG/edgecraftrag/api/v1/model.py new file mode 100644 index 0000000000..17044ae91f --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/api/v1/model.py @@ -0,0 +1,76 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import gc + +from edgecraftrag.api_schema import ModelIn +from edgecraftrag.context import ctx +from fastapi import FastAPI + +model_app = FastAPI() + + +# GET Models +@model_app.get(path="/v1/settings/models") +async def get_models(): + return ctx.get_model_mgr().get_models() + + +# GET Model +@model_app.get(path="/v1/settings/models/{model_id:path}") +async def get_model_by_name(model_id): + return ctx.get_model_mgr().get_model_by_name(model_id) + + +# POST Model +@model_app.post(path="/v1/settings/models") +async def add_model(request: ModelIn): + modelmgr = ctx.get_model_mgr() + # Currently uses asyncio.Lock() to serialize concurrent requests + async with modelmgr._lock: + model = modelmgr.search_model(request) + if model is None: + model = modelmgr.load_model(request) + modelmgr.add(model) + return model.model_id + " model loaded" + + +# PATCH Model +@model_app.patch(path="/v1/settings/models/{model_id:path}") +async def update_model(model_id, request: ModelIn): + # Patching a model is a two-step process: 1. delete the old model, 2. create the new one + active_pl = ctx.get_pipeline_mgr().get_active_pipeline() + modelmgr = ctx.get_model_mgr() + if active_pl and active_pl.model_existed(model_id): + return "Model is being used by the active pipeline, unable to update model" + else: + async with modelmgr._lock: + if modelmgr.get_model_by_name(model_id) is
None: + # Make sure the original model still exists before updating it, + # to prevent a memory leak under concurrent requests + return "Model " + model_id + " does not exist" + model = modelmgr.search_model(request) + if model is None: + modelmgr.del_model_by_name(model_id) + # Clean up memory occupation + gc.collect() + # load new model + model = modelmgr.load_model(request) + modelmgr.add(model) + return model + + +# DELETE Model +@model_app.delete(path="/v1/settings/models/{model_id:path}") +async def delete_model(model_id): + active_pl = ctx.get_pipeline_mgr().get_active_pipeline() + if active_pl and active_pl.model_existed(model_id): + return "Model is being used by the active pipeline, unable to remove" + else: + modelmgr = ctx.get_model_mgr() + # Currently uses asyncio.Lock() to serialize concurrent requests + async with modelmgr._lock: + response = modelmgr.del_model_by_name(model_id) + # Clean up memory occupation + gc.collect() + return response diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py new file mode 100644 index 0000000000..9d008e82f7 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py @@ -0,0 +1,180 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import weakref + +from edgecraftrag.api_schema import PipelineCreateIn +from edgecraftrag.base import IndexerType, InferenceType, ModelType, NodeParserType, PostProcessorType, RetrieverType +from edgecraftrag.components.generator import QnAGenerator +from edgecraftrag.components.indexer import VectorIndexer +from edgecraftrag.components.node_parser import HierarchyNodeParser, SimpleNodeParser, SWindowNodeParser +from edgecraftrag.components.postprocessor import MetadataReplaceProcessor, RerankProcessor +from edgecraftrag.components.retriever import AutoMergeRetriever, SimpleBM25Retriever, VectorSimRetriever +from edgecraftrag.context import ctx +from fastapi import FastAPI + +pipeline_app = FastAPI() + + +# GET Pipelines +@pipeline_app.get(path="/v1/settings/pipelines") +async def get_pipelines(): + return ctx.get_pipeline_mgr().get_pipelines() + + +# GET Pipeline +@pipeline_app.get(path="/v1/settings/pipelines/{name}") +async def get_pipeline(name): + return ctx.get_pipeline_mgr().get_pipeline_by_name_or_id(name) + + +# POST Pipeline +@pipeline_app.post(path="/v1/settings/pipelines") +async def add_pipeline(request: PipelineCreateIn): + pl = ctx.get_pipeline_mgr().get_pipeline_by_name_or_id(request.name) + if pl is None: + pl = ctx.get_pipeline_mgr().create_pipeline(request.name) + active_pl = ctx.get_pipeline_mgr().get_active_pipeline() + if pl == active_pl: + if not request.active: + pass + else: + return "Unable to patch an active pipeline..." + update_pipeline_handler(pl, request) + return pl + + +# PATCH Pipeline +@pipeline_app.patch(path="/v1/settings/pipelines/{name}") +async def update_pipeline(name, request: PipelineCreateIn): + pl = ctx.get_pipeline_mgr().get_pipeline_by_name_or_id(name) + if pl is None: + return None + active_pl = ctx.get_pipeline_mgr().get_active_pipeline() + if pl == active_pl: + if not request.active: + pass + else: + return "Unable to patch an active pipeline..."
diff --git a/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py
new file mode 100644
index 0000000000..9d008e82f7
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/api/v1/pipeline.py
@@ -0,0 +1,180 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import weakref
+
+from edgecraftrag.api_schema import PipelineCreateIn
+from edgecraftrag.base import IndexerType, InferenceType, ModelType, NodeParserType, PostProcessorType, RetrieverType
+from edgecraftrag.components.generator import QnAGenerator
+from edgecraftrag.components.indexer import VectorIndexer
+from edgecraftrag.components.node_parser import HierarchyNodeParser, SimpleNodeParser, SWindowNodeParser
+from edgecraftrag.components.postprocessor import MetadataReplaceProcessor, RerankProcessor
+from edgecraftrag.components.retriever import AutoMergeRetriever, SimpleBM25Retriever, VectorSimRetriever
+from edgecraftrag.context import ctx
+from fastapi import FastAPI
+
+pipeline_app = FastAPI()
+
+
+# GET Pipelines
+@pipeline_app.get(path="/v1/settings/pipelines")
+async def get_pipelines():
+    return ctx.get_pipeline_mgr().get_pipelines()
+
+
+# GET Pipeline
+@pipeline_app.get(path="/v1/settings/pipelines/{name}")
+async def get_pipeline(name):
+    return ctx.get_pipeline_mgr().get_pipeline_by_name_or_id(name)
+
+
+# POST Pipeline
+@pipeline_app.post(path="/v1/settings/pipelines")
+async def add_pipeline(request: PipelineCreateIn):
+    pl = ctx.get_pipeline_mgr().get_pipeline_by_name_or_id(request.name)
+    if pl is None:
+        pl = ctx.get_pipeline_mgr().create_pipeline(request.name)
+    active_pl = ctx.get_pipeline_mgr().get_active_pipeline()
+    if pl == active_pl and request.active:
+        return "Unable to patch an active pipeline..."
+    update_pipeline_handler(pl, request)
+    return pl
+
+
+# PATCH Pipeline
+@pipeline_app.patch(path="/v1/settings/pipelines/{name}")
+async def update_pipeline(name, request: PipelineCreateIn):
+    pl = ctx.get_pipeline_mgr().get_pipeline_by_name_or_id(name)
+    if pl is None:
+        return None
+    active_pl = ctx.get_pipeline_mgr().get_active_pipeline()
+    if pl == active_pl and request.active:
+        return "Unable to patch an active pipeline..."
+    async with ctx.get_pipeline_mgr()._lock:
+        update_pipeline_handler(pl, request)
+    return pl
+
+
+def update_pipeline_handler(pl, req):
+    if req.node_parser is not None:
+        np = req.node_parser
+        found_parser = ctx.get_node_parser_mgr().search_parser(np)
+        if found_parser is not None:
+            pl.node_parser = found_parser
+        else:
+            match np.parser_type:
+                case NodeParserType.SIMPLE:
+                    pl.node_parser = SimpleNodeParser(chunk_size=np.chunk_size, chunk_overlap=np.chunk_overlap)
+                case NodeParserType.HIERARCHY:
+                    """
+                    HierarchyNodeParser is for the Auto Merging Retriever
+                    (https://docs.llamaindex.ai/en/stable/examples/retrievers/auto_merging_retriever/).
+                    By default, the hierarchy is:
+                    1st level: chunk size 2048
+                    2nd level: chunk size 512
+                    3rd level: chunk size 128
+                    Set the chunk sizes with a list, e.g. chunk_sizes=[2048, 512, 128].
+                    """
+                    pl.node_parser = HierarchyNodeParser.from_defaults(
+                        chunk_sizes=np.chunk_sizes, chunk_overlap=np.chunk_overlap
+                    )
+                case NodeParserType.SENTENCEWINDOW:
+                    pl.node_parser = SWindowNodeParser.from_defaults(window_size=np.window_size)
+            ctx.get_node_parser_mgr().add(pl.node_parser)
+
+    if req.indexer is not None:
+        ind = req.indexer
+        found_indexer = ctx.get_indexer_mgr().search_indexer(ind)
+        if found_indexer is not None:
+            pl.indexer = found_indexer
+        else:
+            embed_model = None
+            if ind.embedding_model:
+                embed_model = ctx.get_model_mgr().search_model(ind.embedding_model)
+                if embed_model is None:
+                    ind.embedding_model.model_type = ModelType.EMBEDDING
+                    embed_model = ctx.get_model_mgr().load_model(ind.embedding_model)
+                    ctx.get_model_mgr().add(embed_model)
+            match ind.indexer_type:
+                case IndexerType.DEFAULT_VECTOR | IndexerType.FAISS_VECTOR:
+                    # TODO: RISK: what happens if two pipelines share the same
+                    # indexer but hold different nodes?
+                    pl.indexer = VectorIndexer(embed_model, ind.indexer_type)
+                case _:
+                    pass
+            ctx.get_indexer_mgr().add(pl.indexer)
+
+    if req.retriever is not None:
+        retr = req.retriever
+        match retr.retriever_type:
+            case RetrieverType.VECTORSIMILARITY:
+                if pl.indexer is not None:
+                    pl.retriever = VectorSimRetriever(pl.indexer, similarity_top_k=retr.retrieve_topk)
+                else:
+                    return "No indexer"
+            case RetrieverType.AUTOMERGE:
+                # AutoMergeRetriever looks at a set of leaf nodes and recursively
+                # "merges" subsets of leaf nodes that reference a parent node
+                if pl.indexer is not None:
+                    pl.retriever = AutoMergeRetriever(pl.indexer, similarity_top_k=retr.retrieve_topk)
+                else:
+                    return "No indexer"
+            case RetrieverType.BM25:
+                if pl.indexer is not None:
+                    pl.retriever = SimpleBM25Retriever(pl.indexer, similarity_top_k=retr.retrieve_topk)
+                else:
+                    return "No indexer"
+            case _:
+                pass
+
+    if req.postprocessor is not None:
+        pp = req.postprocessor
+        pl.postprocessor = []
+        for processor in pp:
+            match processor.processor_type:
+                case PostProcessorType.RERANKER:
+                    if processor.reranker_model:
+                        prm = processor.reranker_model
+                        reranker_model = ctx.get_model_mgr().search_model(prm)
+                        if reranker_model is None:
+                            prm.model_type = ModelType.RERANKER
+                            reranker_model = ctx.get_model_mgr().load_model(prm)
+                            ctx.get_model_mgr().add(reranker_model)
+                        postprocessor = RerankProcessor(reranker_model, processor.top_n)
+                        pl.postprocessor.append(postprocessor)
+                    else:
+                        return "No reranker model"
+                case PostProcessorType.METADATAREPLACE:
+                    postprocessor = MetadataReplaceProcessor(target_metadata_key="window")
+                    pl.postprocessor.append(postprocessor)
+
+    if req.generator:
+        gen = req.generator
+        if gen.model is None:
+            return "No ChatQnA Model"
+        if gen.inference_type == InferenceType.VLLM:
+            if gen.model.model_id:
+                model_ref = gen.model.model_id
+            else:
+                model_ref = gen.model.model_path
+            pl.generator = QnAGenerator(model_ref, gen.prompt_path, gen.inference_type)
+        elif gen.inference_type == InferenceType.LOCAL:
+            model = ctx.get_model_mgr().search_model(gen.model)
+            if model is None:
+                gen.model.model_type = ModelType.LLM
+                model = ctx.get_model_mgr().load_model(gen.model)
+                ctx.get_model_mgr().add(model)
+            # Use a weakref so that deleting the model elsewhere can actually release its memory
+            model_ref = weakref.ref(model)
+            pl.generator = QnAGenerator(model_ref, gen.prompt_path, gen.inference_type)
+        else:
+            return "Inference Type Not Supported"
+
+    if pl.status.active != req.active:
+        ctx.get_pipeline_mgr().activate_pipeline(pl.name, req.active, ctx.get_node_mgr())
+    return pl
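Creating and activating a pipeline is a single POST. A sketch reusing the test payload from tests/test_pipeline_local_llm.json (shown later in this patch), assuming it is run from the EdgeCraftRAG directory against the default port:

```python
import json

import requests

with open("tests/test_pipeline_local_llm.json") as f:
    payload = json.load(f)

r = requests.post("http://localhost:16010/v1/settings/pipelines", json=payload)
print(r.json())  # serialized pipeline; it becomes active because the payload sets "active"
```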
diff --git a/EdgeCraftRAG/edgecraftrag/api_schema.py b/EdgeCraftRAG/edgecraftrag/api_schema.py
new file mode 100644
index 0000000000..1f124a7f9a
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/api_schema.py
@@ -0,0 +1,62 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ModelIn(BaseModel):
+    model_type: Optional[str] = "llm"
+    model_id: Optional[str] = None
+    model_path: Optional[str] = "./"
+    device: Optional[str] = "cpu"
+
+
+class NodeParserIn(BaseModel):
+    chunk_size: Optional[int] = None
+    chunk_overlap: Optional[int] = None
+    chunk_sizes: Optional[list] = None
+    parser_type: str
+    window_size: Optional[int] = None
+
+
+class IndexerIn(BaseModel):
+    indexer_type: str
+    embedding_model: Optional[ModelIn] = None
+
+
+class RetrieverIn(BaseModel):
+    retriever_type: str
+    retrieve_topk: Optional[int] = 3
+
+
+class PostProcessorIn(BaseModel):
+    processor_type: str
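These schemas compose, so a pipeline spec can be validated locally before it is POSTed. A small sketch, assuming pydantic v2 (for model_dump_json) and an illustrative embedding model id:

```python
from edgecraftrag.api_schema import IndexerIn, ModelIn, NodeParserIn, PipelineCreateIn, RetrieverIn

spec = PipelineCreateIn(
    name="demo",
    node_parser=NodeParserIn(parser_type="simple", chunk_size=400, chunk_overlap=48),
    indexer=IndexerIn(indexer_type="faiss_vector", embedding_model=ModelIn(model_id="BAAI/bge-small-en-v1.5")),
    retriever=RetrieverIn(retriever_type="vectorsimilarity", retrieve_topk=30),
)
print(spec.model_dump_json(indent=2))  # ready to send to /v1/settings/pipelines
```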
reranker_model: Optional[ModelIn] = None
+    top_n: Optional[int] = 5
+
+
+class GeneratorIn(BaseModel):
+    prompt_path: Optional[str] = None
+    model: Optional[ModelIn] = None
+    inference_type: Optional[str] = "local"
+
+
+class PipelineCreateIn(BaseModel):
+    name: Optional[str] = None
+    node_parser: Optional[NodeParserIn] = None
+    indexer: Optional[IndexerIn] = None
+    retriever: Optional[RetrieverIn] = None
+    postprocessor: Optional[list[PostProcessorIn]] = None
+    generator: Optional[GeneratorIn] = None
+    active: Optional[bool] = False
+
+
+class DataIn(BaseModel):
+    text: Optional[str] = None
+    local_path: Optional[str] = None
+
+
+class FilesIn(BaseModel):
+    local_paths: Optional[list[str]] = None
diff --git a/EdgeCraftRAG/edgecraftrag/base.py b/EdgeCraftRAG/edgecraftrag/base.py
new file mode 100644
index 0000000000..d8c7aaef84
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/base.py
@@ -0,0 +1,128 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import abc
+import uuid
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import BaseModel, ConfigDict, Field, model_serializer
+
+
+class CompType(str, Enum):
+
+    DEFAULT = "default"
+    MODEL = "model"
+    PIPELINE = "pipeline"
+    NODEPARSER = "node_parser"
+    INDEXER = "indexer"
+    RETRIEVER = "retriever"
+    POSTPROCESSOR = "postprocessor"
+    GENERATOR = "generator"
+    FILE = "file"
+
+
+class ModelType(str, Enum):
+
+    EMBEDDING = "embedding"
+    RERANKER = "reranker"
+    LLM = "llm"
+
+
+class FileType(str, Enum):
+    TEXT = "text"
+    VISUAL = "visual"
+    AURAL = "aural"
+    VIRTUAL = "virtual"
+    OTHER = "other"
+
+
+class NodeParserType(str, Enum):
+
+    DEFAULT = "default"
+    SIMPLE = "simple"
+    HIERARCHY = "hierarchical"
+    SENTENCEWINDOW = "sentencewindow"
+
+
+class IndexerType(str, Enum):
+
+    DEFAULT = "default"
+    FAISS_VECTOR = "faiss_vector"
+    DEFAULT_VECTOR = "vector"
+
+
+class RetrieverType(str, Enum):
+
+    DEFAULT = "default"
+    VECTORSIMILARITY = "vectorsimilarity"
+    AUTOMERGE = "auto_merge"
+    BM25 = "bm25"
+
+
+class PostProcessorType(str, Enum):
+
+    RERANKER = "reranker"
+    METADATAREPLACE = "metadata_replace"
+
+
+class GeneratorType(str, Enum):
+
+    CHATQNA = "chatqna"
+
+
+class InferenceType(str, Enum):
+
+    LOCAL = "local"
+    VLLM = "vllm"
+
+
+class CallbackType(str, Enum):
+
+    DATAPREP = "dataprep"
+    RETRIEVE = "retrieve"
+    PIPELINE = "pipeline"
+
+
+class BaseComponent(BaseModel):
+
+    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+
+    idx: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    name: Optional[str] = Field(default="")
+    comp_type: str = Field(default="")
+    comp_subtype: Optional[str] = Field(default="")
+
+    @model_serializer
+    def ser_model(self):
+        set = {
+            "idx": self.idx,
+            "name": self.name,
+            "comp_type": self.comp_type,
+            "comp_subtype": self.comp_subtype,
+        }
+        return set
+
+    @abc.abstractmethod
+    def run(self, **kwargs) -> Any:
+        pass
+
+
+class BaseMgr:
+
+    def __init__(self):
+        self.components = {}
+
+    def add(self, comp: BaseComponent):
+        self.components[comp.idx] = comp
+
+    def get(self, idx: str) -> BaseComponent:
+        if idx in self.components:
+            return self.components[idx]
+        else:
+            return None
+
+    def remove(self, idx):
+        # Drop our reference; once the reference count reaches 0,
+        # the object's memory can be freed by the garbage collector
+        del self.components[idx]
diff --git a/EdgeCraftRAG/edgecraftrag/components/__init__.py b/EdgeCraftRAG/edgecraftrag/components/__init__.py
new file mode 100644
index 0000000000..916f3a44b2
--- 
/dev/null +++ b/EdgeCraftRAG/edgecraftrag/components/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/EdgeCraftRAG/edgecraftrag/components/data.py b/EdgeCraftRAG/edgecraftrag/components/data.py new file mode 100644 index 0000000000..e7fa19e7ad --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/components/data.py @@ -0,0 +1,65 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from typing import Any, List, Optional + +from edgecraftrag.base import BaseComponent, CompType, FileType +from llama_index.core.schema import Document +from pydantic import BaseModel, Field, model_serializer + + +class File(BaseComponent): + file_path: str = Field(default="") + comp_subtype: str = Field(default="") + documents: List[Document] = Field(default=[]) + + def __init__(self, file_name: Optional[str] = None, file_path: Optional[str] = None, content: Optional[str] = None): + super().__init__(comp_type=CompType.FILE) + + if not file_name and not file_path: + raise ValueError("File name or path must be provided") + + _path = Path(file_path) if file_path else None + if file_name: + self.name = file_name + else: + self.name = _path.name + self.file_path = _path + self.comp_subtype = FileType.TEXT + if _path and _path.exists(): + self.documents.extend(convert_file_to_documents(_path)) + if content: + self.documents.extend(convert_text_to_documents(content)) + + def run(self, **kwargs) -> Any: + pass + + @model_serializer + def ser_model(self): + set = { + "file_name": self.name, + "file_id": self.idx, + "file_type": self.comp_subtype, + "file_path": str(self.file_path), + "docs_count": len(self.documents), + } + return set + + +def convert_text_to_documents(text) -> List[Document]: + return [Document(text=text, metadata={"file_name": "text"})] + + +def convert_file_to_documents(file_path) -> List[Document]: + from llama_index.core import SimpleDirectoryReader + + supported_exts = [".pdf", ".txt", ".doc", ".docx", ".pptx", ".ppt", ".csv", ".md", ".html", ".rst"] + if file_path.is_dir(): + docs = SimpleDirectoryReader(input_dir=file_path, recursive=True, required_exts=supported_exts).load_data() + elif file_path.is_file(): + docs = SimpleDirectoryReader(input_files=[file_path], required_exts=supported_exts).load_data() + else: + docs = [] + + return docs diff --git a/EdgeCraftRAG/edgecraftrag/components/generator.py b/EdgeCraftRAG/edgecraftrag/components/generator.py new file mode 100644 index 0000000000..cbfd6686d0 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/components/generator.py @@ -0,0 +1,194 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import dataclasses +import os + +from comps import GeneratedDoc, opea_telemetry +from edgecraftrag.base import BaseComponent, CompType, GeneratorType +from fastapi.responses import StreamingResponse +from langchain_core.prompts import PromptTemplate +from llama_index.llms.openai_like import OpenAILike +from pydantic import model_serializer + + +@opea_telemetry +def post_process_text(text: str): + if text == " ": + return "data: @#$\n\n" + if text == "\n": + return "data:
\n\n" + if text.isspace(): + return None + new_text = text.replace(" ", "@#$") + return f"data: {new_text}\n\n" + + +class QnAGenerator(BaseComponent): + + def __init__(self, llm_model, prompt_template, inference_type, **kwargs): + BaseComponent.__init__( + self, + comp_type=CompType.GENERATOR, + comp_subtype=GeneratorType.CHATQNA, + ) + self.inference_type = inference_type + self._REPLACE_PAIRS = ( + ("\n\n", "\n"), + ("\t\n", "\n"), + ) + template = prompt_template + self.prompt = ( + DocumentedContextRagPromptTemplate.from_file(template) + if os.path.isfile(template) + else DocumentedContextRagPromptTemplate.from_template(template) + ) + self.llm = llm_model + if isinstance(llm_model, str): + self.model_id = llm_model + else: + self.model_id = llm_model().model_id + + def clean_string(self, string): + ret = string + for p in self._REPLACE_PAIRS: + ret = ret.replace(*p) + return ret + + def run(self, chat_request, retrieved_nodes, **kwargs): + if self.llm() is None: + # This could happen when User delete all LLMs through RESTful API + return "No LLM available, please load LLM" + # query transformation + text_gen_context = "" + for n in retrieved_nodes: + origin_text = n.node.get_text() + text_gen_context += self.clean_string(origin_text.strip()) + + query = chat_request.messages + prompt_str = self.prompt.format(input=query, context=text_gen_context) + generate_kwargs = dict( + temperature=chat_request.temperature, + do_sample=chat_request.temperature > 0.0, + top_p=chat_request.top_p, + top_k=chat_request.top_k, + typical_p=chat_request.typical_p, + repetition_penalty=chat_request.repetition_penalty, + ) + self.llm().generate_kwargs = generate_kwargs + + return self.llm().complete(prompt_str) + + def run_vllm(self, chat_request, retrieved_nodes, **kwargs): + if self.llm is None: + return "No LLM provided, please provide model_id_or_path" + # query transformation + text_gen_context = "" + for n in retrieved_nodes: + origin_text = n.node.get_text() + text_gen_context += self.clean_string(origin_text.strip()) + + query = chat_request.messages + prompt_str = self.prompt.format(input=query, context=text_gen_context) + + llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8008") + model_name = self.llm + llm = OpenAILike( + api_key="fake", + api_base=llm_endpoint + "/v1", + max_tokens=chat_request.max_tokens, + model=model_name, + top_p=chat_request.top_p, + temperature=chat_request.temperature, + streaming=chat_request.stream, + ) + + if chat_request.stream: + + async def stream_generator(): + response = await llm.astream_complete(prompt_str) + async for text in response: + output = text.text + yield f"data: {output}\n\n" + + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + response = llm.complete(prompt_str) + response = response.text + + return GeneratedDoc(text=response, prompt=prompt_str) + + @model_serializer + def ser_model(self): + set = {"idx": self.idx, "generator_type": self.comp_subtype, "model": self.model_id} + return set + + +@dataclasses.dataclass +class INSTRUCTIONS: + IM_START = "You are an AI assistant that helps users answer questions given a specific context." + SUCCINCT = "Ensure your response is succinct" + ACCURATE = "Ensure your response is accurate." + SUCCINCT_AND_ACCURATE = "Ensure your response is succinct. Try to be accurate if possible." + ACCURATE_AND_SUCCINCT = "Ensure your response is accurate. Try to be succinct if possible." 
+    NO_RAMBLING = "Avoid posing new questions or self-questioning and answering, and refrain from repeating words in your response."
+    SAY_SOMETHING = "Avoid meaningless answers such as random symbols or blanks."
+    ENCOURAGE = "If you cannot understand the question well, try to translate it into English, and translate the answer back into the language of the question."
+    NO_IDEA = (
+        'If the answer is not discernible, please respond with "Sorry. I have no idea" in the language of the question.'
+    )
+    CLOZE_TEST = """The task is a fill-in-the-blank/cloze test."""
+    NO_MEANINGLESS_SYMBOLS = "Meaningless symbols and ``` should not be included in your response."
+    ADAPT_NATIVE_LANGUAGE = "Please try to think like a person who speaks the same language that the question uses."
+
+
+def _is_cloze(question):
+    return ("()" in question or "（）" in question) and ("填" in question or "fill" in question or "cloze" in question)
+
+
+# deprecated
+def get_instructions(question):
+    # naive pre-retrieval rewrite
+    # cloze
+    if _is_cloze(question):
+        instructions = [
+            INSTRUCTIONS.CLOZE_TEST,
+        ]
+    else:
+        instructions = [
+            INSTRUCTIONS.ACCURATE_AND_SUCCINCT,
+            INSTRUCTIONS.NO_RAMBLING,
+            INSTRUCTIONS.NO_MEANINGLESS_SYMBOLS,
+        ]
+    return ["System: {}".format(_) for _ in instructions]
+
+
+def preprocess_question(question):
+    if _is_cloze(question):
+        question = question.replace(" ", "").replace("（", "(").replace("）", ")")
+        # .replace("()", " <|blank|> ")
+        ret = "User: Please finish the following fill-in-the-blank question marked by $$$ at the beginning and end. Make sure all the () are filled.\n$$$\n{}\n$$$\nAssistant: ".format(
+            question
+        )
+    else:
+        ret = "User: {}\nAssistant: 从上下文提供的信息中可以知道,".format(question)
+    return ret
+
+
+class DocumentedContextRagPromptTemplate(PromptTemplate):
+
+    def format(self, **kwargs) -> str:
+        # context = '\n'.join([clean_string(f"{_.page_content}".strip()) for i, _ in enumerate(kwargs["context"])])
+        context = kwargs["context"]
+        question = kwargs["input"]
+        preprocessed_question = preprocess_question(question)
+        if "instructions" in self.template:
+            instructions = get_instructions(question)
+            prompt_str = self.template.format(
+                context=context, instructions="\n".join(instructions), input=preprocessed_question
+            )
+        else:
+            prompt_str = self.template.format(context=context, input=preprocessed_question)
+        return prompt_str
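The prompt assembly can be checked in isolation. A sketch, assuming it runs from the EdgeCraftRAG directory so the default template path resolves; the context and question strings are made up:

```python
from edgecraftrag.components.generator import DocumentedContextRagPromptTemplate

prompt = DocumentedContextRagPromptTemplate.from_file("./edgecraftrag/prompt_template/default_prompt.txt")
# preprocess_question() wraps the raw question into a "User: ... Assistant: ..." turn
print(prompt.format(context="EdgeCraftRAG is an edge RAG framework.", input="What is EdgeCraftRAG?"))
```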
diff --git a/EdgeCraftRAG/edgecraftrag/components/indexer.py b/EdgeCraftRAG/edgecraftrag/components/indexer.py
new file mode 100644
index 0000000000..83346d4901
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/components/indexer.py
@@ -0,0 +1,45 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+import faiss
+from edgecraftrag.base import BaseComponent, CompType, IndexerType
+from llama_index.core import StorageContext, VectorStoreIndex
+from llama_index.vector_stores.faiss import FaissVectorStore
+from pydantic import model_serializer
+
+
+class VectorIndexer(BaseComponent, VectorStoreIndex):
+
+    def __init__(self, embed_model, vector_type):
+        BaseComponent.__init__(
+            self,
+            comp_type=CompType.INDEXER,
+            comp_subtype=vector_type,
+        )
+        self.model = embed_model
+        if not embed_model:
+            # Settings.embed_model must be set to None when embed_model is None to avoid a 'no OpenAI key' error
+            from llama_index.core import Settings
+
+            Settings.embed_model = None
+        match vector_type:
+            case IndexerType.DEFAULT_VECTOR:
+                VectorStoreIndex.__init__(self, embed_model=embed_model, nodes=[])
+            case IndexerType.FAISS_VECTOR:
+                if embed_model:
+                    d = embed_model._model.request.outputs[0].get_partial_shape()[2].get_length()
+                else:
+                    d = 128
+                faiss_index = faiss.IndexFlatL2(d)
+                faiss_store = StorageContext.from_defaults(vector_store=FaissVectorStore(faiss_index=faiss_index))
+                VectorStoreIndex.__init__(self, embed_model=embed_model, nodes=[], storage_context=faiss_store)
+
+    def run(self, **kwargs) -> Any:
+        pass
+
+    @model_serializer
+    def ser_model(self):
+        set = {"idx": self.idx, "indexer_type": self.comp_subtype, "model": self.model}
+        return set
diff --git a/EdgeCraftRAG/edgecraftrag/components/model.py b/EdgeCraftRAG/edgecraftrag/components/model.py
new file mode 100644
index 0000000000..72ee7f16e0
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/components/model.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Optional
+
+from edgecraftrag.base import BaseComponent, CompType, ModelType
+from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding
+from llama_index.llms.openvino import OpenVINOLLM
+from llama_index.postprocessor.openvino_rerank import OpenVINORerank
+from pydantic import Field, model_serializer
+
+
+class BaseModelComponent(BaseComponent):
+
+    model_id: Optional[str] = Field(default="")
+    model_path: Optional[str] = Field(default="")
+    device: Optional[str] = Field(default="cpu")
+
+    def run(self, **kwargs) -> Any:
+        pass
+
+    @model_serializer
+    def ser_model(self):
+        set = {
+            "idx": self.idx,
+            "type": self.comp_subtype,
+            "model_id": self.model_id,
+            "model_path": self.model_path,
+            "device": self.device,
+        }
+        return set
+
+
+class OpenVINOEmbeddingModel(BaseModelComponent, OpenVINOEmbedding):
+
+    def __init__(self, model_id, model_path, device):
+        OpenVINOEmbedding.create_and_save_openvino_model(model_id, model_path)
+        OpenVINOEmbedding.__init__(self, model_id_or_path=model_path, device=device)
+        self.comp_type = CompType.MODEL
+        self.comp_subtype = ModelType.EMBEDDING
+        self.model_id = model_id
+        self.model_path = model_path
+        self.device = device
+
+
+class OpenVINORerankModel(BaseModelComponent, OpenVINORerank):
+
+    def __init__(self, model_id, model_path, device):
+        OpenVINORerank.create_and_save_openvino_model(model_id, model_path)
+        OpenVINORerank.__init__(
+            self,
+            model_id_or_path=model_path,
+            device=device,
+        )
+        self.comp_type = CompType.MODEL
+        self.comp_subtype = ModelType.RERANKER
+        self.model_id = model_id
+        self.model_path = model_path
+        self.device = device
+
+
+class OpenVINOLLMModel(BaseModelComponent, OpenVINOLLM):
+
+    def __init__(self, model_id, model_path, device):
+        OpenVINOLLM.__init__(
+            self,
+            model_id_or_path=model_path,
+            device_map=device,
+        )
+        self.comp_type = CompType.MODEL
+        self.comp_subtype = ModelType.LLM
+        self.model_id = model_id
+        self.model_path = model_path
+        self.device = device
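The FAISS_VECTOR branch boils down to a few llama_index calls. A standalone sketch, assuming the fallback dimension d=128 that VectorIndexer uses when no embedding model is attached:

```python
import faiss
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.vector_stores.faiss import FaissVectorStore

Settings.embed_model = None  # same trick as VectorIndexer: avoid LlamaIndex's OpenAI default
d = 128                      # embedding dimension; probed from the OpenVINO model when one is present
storage = StorageContext.from_defaults(vector_store=FaissVectorStore(faiss_index=faiss.IndexFlatL2(d)))
index = VectorStoreIndex(nodes=[], storage_context=storage, embed_model=None)
```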
diff --git a/EdgeCraftRAG/edgecraftrag/components/node_parser.py b/EdgeCraftRAG/edgecraftrag/components/node_parser.py
new file mode 100644
index 0000000000..cd50f45347
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/components/node_parser.py
@@ -0,0 +1,85 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any
+
+from edgecraftrag.base import BaseComponent, CompType, NodeParserType
+from llama_index.core.node_parser import HierarchicalNodeParser, SentenceSplitter, SentenceWindowNodeParser
+from pydantic import model_serializer
+
+
+class SimpleNodeParser(BaseComponent, SentenceSplitter):
+
+    # Initialize via super() for SentenceSplitter, since its __init__ will
+    # clean up BaseComponent fields
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.comp_type = CompType.NODEPARSER
+        self.comp_subtype = NodeParserType.SIMPLE
+
+    def run(self, **kwargs) -> Any:
+        for k, v in kwargs.items():
+            if k == "docs":
+                return self.get_nodes_from_documents(v, show_progress=False)
+
+        return None
+
+    @model_serializer
+    def ser_model(self):
+        set = {
+            "idx": self.idx,
+            "parser_type": self.comp_subtype,
+            "chunk_size": self.chunk_size,
+            "chunk_overlap": self.chunk_overlap,
+        }
+        return set
+
+
+class HierarchyNodeParser(BaseComponent, HierarchicalNodeParser):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.comp_type = CompType.NODEPARSER
+        self.comp_subtype = NodeParserType.HIERARCHY
+
+    def run(self, **kwargs) -> Any:
+        for k, v in kwargs.items():
+            if k == "docs":
+                return self.get_nodes_from_documents(v, show_progress=False)
+
+        return None
+
+    @model_serializer
+    def ser_model(self):
+        set = {
+            "idx": self.idx,
+            "parser_type": self.comp_subtype,
+            "chunk_size": self.chunk_sizes,
+            "chunk_overlap": None,
+        }
+        return set
+
+
+class SWindowNodeParser(BaseComponent, SentenceWindowNodeParser):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.comp_type = CompType.NODEPARSER
+        self.comp_subtype = NodeParserType.SENTENCEWINDOW
+
+    def run(self, **kwargs) -> Any:
+        for k, v in kwargs.items():
+            if k == "docs":
+                return self.get_nodes_from_documents(v, show_progress=False)
+
+        return None
+
+    @model_serializer
+    def ser_model(self):
+        set = {
+            "idx": self.idx,
+            "parser_type": self.comp_subtype,
+            "chunk_size": None,
+            "chunk_overlap": None,
+        }
+        return set
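A quick sketch of the three parser flavors side by side, assuming llama_index Documents as input; the parameters mirror the ones the pipeline API accepts:

```python
from edgecraftrag.components.node_parser import HierarchyNodeParser, SimpleNodeParser, SWindowNodeParser
from llama_index.core import Document

docs = [Document(text="EdgeCraftRAG splits documents into nodes before indexing them.")]

parsers = [
    SimpleNodeParser(chunk_size=400, chunk_overlap=48),
    HierarchyNodeParser.from_defaults(chunk_sizes=[2048, 512, 128], chunk_overlap=20),
    SWindowNodeParser.from_defaults(window_size=3),
]
for parser in parsers:
    print(type(parser).__name__, len(parser.run(docs=docs)))
```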
diff --git a/EdgeCraftRAG/edgecraftrag/components/pipeline.py b/EdgeCraftRAG/edgecraftrag/components/pipeline.py
new file mode 100644
index 0000000000..4a2932e00b
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/components/pipeline.py
@@ -0,0 +1,160 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, List, Optional
+
+from comps.cores.proto.api_protocol import ChatCompletionRequest
+from edgecraftrag.base import BaseComponent, CallbackType, CompType, InferenceType
+from edgecraftrag.components.postprocessor import RerankProcessor
+from llama_index.core.schema import Document, QueryBundle
+from pydantic import BaseModel, Field, model_serializer
+
+
+class PipelineStatus(BaseModel):
+    active: bool = False
+
+
+class Pipeline(BaseComponent):
+
+    node_parser: Optional[BaseComponent] = Field(default=None)
+    indexer: Optional[BaseComponent] = Field(default=None)
+    retriever: Optional[BaseComponent] = Field(default=None)
+    postprocessor: Optional[List[BaseComponent]] = Field(default=None)
+    generator: Optional[BaseComponent] = Field(default=None)
+    status: PipelineStatus = Field(default=PipelineStatus())
+    run_pipeline_cb: Optional[Callable[..., Any]] = Field(default=None)
+    run_retriever_cb: Optional[Callable[..., Any]] = Field(default=None)
+    run_data_prepare_cb: Optional[Callable[..., Any]] = Field(default=None)
+
+    def __init__(
+        self,
+        name,
+    ):
+        super().__init__(name=name, comp_type=CompType.PIPELINE)
+        if self.name == "" or self.name is None:
+            self.name = self.idx
+        self.run_pipeline_cb = run_test_generator
+        self.run_retriever_cb = run_test_retrieve
+        self.run_data_prepare_cb = run_simple_doc
+        self._node_changed = True
+
+    # TODO: consider race condition
+    @property
+    def node_changed(self) -> bool:
+        return self._node_changed
+
+    # TODO: track document changes
+    # TODO: more operations needed: add, del, modify
+    def update_nodes(self, nodes):
+        print("updating nodes ", nodes)
+        if self.indexer is not None:
+            self.indexer.insert_nodes(nodes)
+
+    # TODO: check more conditions
+    def check_active(self, nodelist):
+        if self._node_changed and nodelist is not None:
+            self.update_nodes(nodelist)
+
+    # Implements the abstract run function as a callback dispatcher
+    def run(self, **kwargs) -> Any:
+        print(kwargs)
+        if "cbtype" in kwargs:
+            if kwargs["cbtype"] == CallbackType.DATAPREP:
+                if "docs" in kwargs:
+                    return self.run_data_prepare_cb(self, docs=kwargs["docs"])
+            if kwargs["cbtype"] == CallbackType.RETRIEVE:
+                if "chat_request" in kwargs:
+                    return self.run_retriever_cb(self, chat_request=kwargs["chat_request"])
+            if kwargs["cbtype"] == CallbackType.PIPELINE:
+                if "chat_request" in kwargs:
+                    return self.run_pipeline_cb(self, chat_request=kwargs["chat_request"])
+
+    def update(self, node_parser=None, indexer=None, retriever=None, postprocessor=None, generator=None):
+        if node_parser is not None:
+            self.node_parser = node_parser
+        if indexer is not None:
+            self.indexer = indexer
+        if retriever is not None:
+            self.retriever = retriever
+        if postprocessor is not None:
+            self.postprocessor = postprocessor
+        if generator is not None:
+            self.generator = generator
+
+    @model_serializer
+    def ser_model(self):
+        set = {
+            "idx": self.idx,
+            "name": self.name,
+            "comp_type": self.comp_type,
+            "node_parser": self.node_parser,
+            "indexer": self.indexer,
+            "retriever": self.retriever,
+            "postprocessor": self.postprocessor,
+            "generator": self.generator,
+            "status": self.status,
+        }
+        return set
+
+    def model_existed(self, model_id: str) -> bool:
+        # Check whether the given model_id is used anywhere in this pipeline
+        if self.indexer:
+            if hasattr(self.indexer, "_embed_model") and self.indexer._embed_model.model_id == model_id:
+                return True
+            if hasattr(self.indexer, "_llm") and self.indexer._llm.model_id == model_id:
+                return True
+        if self.postprocessor:
+            for processor in self.postprocessor:
+                if hasattr(processor, "model_id") and processor.model_id == model_id:
+                    return True
+        if self.generator:
+            llm = self.generator.llm
+            if llm() and llm().model_id == model_id:
+                return True
+        return False
+
+
+# Test callback to retrieve nodes for a query
+def run_test_retrieve(pl: Pipeline, chat_request: ChatCompletionRequest) -> Any:
+    query = chat_request.messages
+    retri_res = pl.retriever.run(query=query)
+    query_bundle = QueryBundle(query)
+    if pl.postprocessor:
+        for processor in pl.postprocessor:
+            if (
+                isinstance(processor, RerankProcessor)
+                and chat_request.top_n != ChatCompletionRequest.model_fields["top_n"].default
+            ):
+                processor.top_n = chat_request.top_n
+            retri_res = processor.run(retri_res=retri_res, query_bundle=query_bundle)
+    return retri_res
+
+
+def run_simple_doc(pl: Pipeline, docs: List[Document]) -> Any:
+    n = pl.node_parser.run(docs=docs)
+    if pl.indexer is not None:
+        pl.indexer.insert_nodes(n)
+        print(pl.indexer._index_struct)
+    return n
+
+
+def run_test_generator(pl: Pipeline, chat_request: ChatCompletionRequest) -> Any:
+    query = chat_request.messages
+    retri_res = pl.retriever.run(query=query)
+    query_bundle = QueryBundle(query)
+    if pl.postprocessor:
+        for processor in pl.postprocessor:
+            if (
+                isinstance(processor, RerankProcessor)
+                and chat_request.top_n != ChatCompletionRequest.model_fields["top_n"].default
+            ):
+                processor.top_n = chat_request.top_n
+            retri_res = processor.run(retri_res=retri_res, 
query_bundle=query_bundle) + if pl.generator is None: + return "No Generator Specified" + if pl.generator.inference_type == InferenceType.LOCAL: + answer = pl.generator.run(chat_request, retri_res) + elif pl.generator.inference_type == InferenceType.VLLM: + answer = pl.generator.run_vllm(chat_request, retri_res) + return answer diff --git a/EdgeCraftRAG/edgecraftrag/components/postprocessor.py b/EdgeCraftRAG/edgecraftrag/components/postprocessor.py new file mode 100644 index 0000000000..672826bdbb --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/components/postprocessor.py @@ -0,0 +1,64 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any + +from edgecraftrag.base import BaseComponent, CompType, PostProcessorType +from llama_index.core.postprocessor import MetadataReplacementPostProcessor +from pydantic import model_serializer + + +class RerankProcessor(BaseComponent): + + def __init__(self, rerank_model, top_n): + BaseComponent.__init__( + self, + comp_type=CompType.POSTPROCESSOR, + comp_subtype=PostProcessorType.RERANKER, + ) + self.model = rerank_model + self.top_n = top_n + + def run(self, **kwargs) -> Any: + self.model.top_n = self.top_n + query_bundle = None + query_str = None + if "retri_res" in kwargs: + nodes = kwargs["retri_res"] + if "query_bundle" in kwargs: + query_bundle = kwargs["query_bundle"] + if "query_str" in kwargs: + query_str = kwargs["query_str"] + return self.model.postprocess_nodes(nodes, query_bundle=query_bundle, query_str=query_str) + + @model_serializer + def ser_model(self): + set = {"idx": self.idx, "postprocessor_type": self.comp_subtype, "model": self.model, "top_n": self.top_n} + return set + + +class MetadataReplaceProcessor(BaseComponent, MetadataReplacementPostProcessor): + + def __init__(self, target_metadata_key="window"): + BaseComponent.__init__( + self, + target_metadata_key=target_metadata_key, + comp_type=CompType.POSTPROCESSOR, + comp_subtype=PostProcessorType.METADATAREPLACE, + ) + + def run(self, **kwargs) -> Any: + query_bundle = None + query_str = None + if "retri_res" in kwargs: + nodes = kwargs["retri_res"] + if "query_bundle" in kwargs: + query_bundle = kwargs["query_bundle"] + if "query_str" in kwargs: + query_str = kwargs["query_str"] + return self.postprocess_nodes(nodes, query_bundle=query_bundle, query_str=query_str) + + @model_serializer + def ser_model(self): + set = {"idx": self.idx, "postprocessor_type": self.comp_subtype, "model": None, "top_n": None} + return set diff --git a/EdgeCraftRAG/edgecraftrag/components/retriever.py b/EdgeCraftRAG/edgecraftrag/components/retriever.py new file mode 100644 index 0000000000..cba251b2a8 --- /dev/null +++ b/EdgeCraftRAG/edgecraftrag/components/retriever.py @@ -0,0 +1,104 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, List, cast + +from edgecraftrag.base import BaseComponent, CompType, RetrieverType +from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever +from llama_index.core.retrievers import AutoMergingRetriever +from llama_index.core.schema import BaseNode +from llama_index.retrievers.bm25 import BM25Retriever +from pydantic import model_serializer + + +class VectorSimRetriever(BaseComponent, VectorIndexRetriever): + + def __init__(self, indexer, **kwargs): + BaseComponent.__init__( + self, + comp_type=CompType.RETRIEVER, + comp_subtype=RetrieverType.VECTORSIMILARITY, + ) + VectorIndexRetriever.__init__( + self, + index=indexer, + 
node_ids=list(indexer.index_struct.nodes_dict.values()),
+            callback_manager=indexer._callback_manager,
+            object_map=indexer._object_map,
+            **kwargs,
+        )
+        # This might be a bug in the llama_index retriever: node_ids is never
+        # updated after the retriever's creation, yet it decides which node ids
+        # are available for retrieval, so the retrievable nodes would be frozen
+        # at creation time. Clear it so all nodes stay retrievable.
+        self._node_ids = None
+
+    def run(self, **kwargs) -> Any:
+        for k, v in kwargs.items():
+            if k == "query":
+                return self.retrieve(v)
+
+        return None
+
+    @model_serializer
+    def ser_model(self):
+        set = {
+            "idx": self.idx,
+            "retriever_type": self.comp_subtype,
+            "retrieve_topk": self.similarity_top_k,
+        }
+        return set
+
+
+class AutoMergeRetriever(BaseComponent, AutoMergingRetriever):
+
+    def __init__(self, indexer, **kwargs):
+        BaseComponent.__init__(
+            self,
+            comp_type=CompType.RETRIEVER,
+            comp_subtype=RetrieverType.AUTOMERGE,
+        )
+        self._index = indexer
+        self.topk = kwargs["similarity_top_k"]
+
+        AutoMergingRetriever.__init__(
+            self,
+            vector_retriever=indexer.as_retriever(**kwargs),
+            storage_context=indexer._storage_context,
+            object_map=indexer._object_map,
+            callback_manager=indexer._callback_manager,
+        )
+
+    def run(self, **kwargs) -> Any:
+        for k, v in kwargs.items():
+            if k == "query":
+                # vector_retriever needs to be updated
+                self._vector_retriever = self._index.as_retriever(similarity_top_k=self.topk)
+                return self.retrieve(v)
+
+        return None
+
+
+class SimpleBM25Retriever(BaseComponent):
+    # The nodes fed to BM25Retriever do not come from the index and cannot be
+    # updated through 'indexer.insert_nodes()', so they are passed to
+    # BM25Retriever after the data preparation stage, not at init stage
+
+    def __init__(self, indexer, **kwargs):
+        BaseComponent.__init__(
+            self,
+            comp_type=CompType.RETRIEVER,
+            comp_subtype=RetrieverType.BM25,
+        )
+        self._docstore = indexer._docstore
+        self.topk = kwargs["similarity_top_k"]
+
+    def run(self, **kwargs) -> Any:
+        for k, v in kwargs.items():
+            if k == "query":
+                nodes = cast(List[BaseNode], list(self._docstore.docs.values()))
+                bm25_retr = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=self.topk)
+                return bm25_retr.retrieve(v)
+
+        return None
diff --git a/EdgeCraftRAG/edgecraftrag/context.py b/EdgeCraftRAG/edgecraftrag/context.py
new file mode 100644
index 0000000000..3555ce4beb
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/context.py
@@ -0,0 +1,52 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from edgecraftrag.controllers.compmgr import GeneratorMgr, IndexerMgr, NodeParserMgr, PostProcessorMgr, RetrieverMgr
+from edgecraftrag.controllers.filemgr import FileMgr
+from edgecraftrag.controllers.modelmgr import ModelMgr
+from edgecraftrag.controllers.nodemgr import NodeMgr
+from edgecraftrag.controllers.pipelinemgr import PipelineMgr
+
+
+class Context:
+
+    def __init__(self):
+        self.plmgr = PipelineMgr()
+        self.nodemgr = NodeMgr()
+        self.npmgr = NodeParserMgr()
+        self.idxmgr = IndexerMgr()
+        self.rtvmgr = RetrieverMgr()
+        self.ppmgr = PostProcessorMgr()
+        self.modmgr = ModelMgr()
+        self.genmgr = GeneratorMgr()
+        self.filemgr = FileMgr()
+
+    def get_pipeline_mgr(self):
+        return self.plmgr
+
+    def get_node_mgr(self):
+        return self.nodemgr
+
+    def get_node_parser_mgr(self):
+        return self.npmgr
+
+    def get_indexer_mgr(self):
+        return self.idxmgr
+
+    def get_retriever_mgr(self):
+        return self.rtvmgr
+
+    def get_postprocessor_mgr(self):
+        return self.ppmgr
+
+    def get_model_mgr(self):
+        return self.modmgr
+
+    def get_generator_mgr(self):
+        return self.genmgr
+
+    def get_file_mgr(self):
+        return self.filemgr
+
+
+ctx = Context()
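The managers can also be driven directly through the shared context, bypassing REST. A minimal sketch:

```python
from edgecraftrag.context import ctx

plmgr = ctx.get_pipeline_mgr()
pl = plmgr.create_pipeline("demo")      # registered, but not active yet
print(plmgr.get_active_pipeline())      # None until activate_pipeline() is called
print(ctx.get_file_mgr().get_files())   # [] before any ingestion
```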
diff --git a/EdgeCraftRAG/edgecraftrag/controllers/__init__.py b/EdgeCraftRAG/edgecraftrag/controllers/__init__.py
new file mode 100644
index 0000000000..916f3a44b2
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/controllers/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/EdgeCraftRAG/edgecraftrag/controllers/compmgr.py b/EdgeCraftRAG/edgecraftrag/controllers/compmgr.py
new file mode 100644
index 0000000000..b8dd82ab7b
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/controllers/compmgr.py
@@ -0,0 +1,66 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from edgecraftrag.api_schema import IndexerIn, NodeParserIn
+from edgecraftrag.base import BaseComponent, BaseMgr, NodeParserType
+
+
+class NodeParserMgr(BaseMgr):
+
+    def __init__(self):
+        super().__init__()
+
+    def search_parser(self, npin: NodeParserIn) -> BaseComponent:
+        for _, v in self.components.items():
+            v_parser_type = v.comp_subtype
+            if v_parser_type == npin.parser_type:
+                if v_parser_type == NodeParserType.HIERARCHY and v.chunk_sizes == npin.chunk_sizes:
+                    return v
+                elif v_parser_type == NodeParserType.SENTENCEWINDOW and v.window_size == npin.window_size:
+                    return v
+                elif (
+                    v_parser_type == NodeParserType.SIMPLE
+                    and v.chunk_size == npin.chunk_size
+                    and v.chunk_overlap == npin.chunk_overlap
+                ):
+                    return v
+        return None
+
+
+class IndexerMgr(BaseMgr):
+
+    def __init__(self):
+        super().__init__()
+
+    def search_indexer(self, indin: IndexerIn) -> BaseComponent:
+        for _, v in self.components.items():
+            if v.comp_subtype == indin.indexer_type:
+                if (
+                    hasattr(v, "model")
+                    and v.model
+                    and indin.embedding_model
+                    and (
+                        (v.model.model_id_or_path == indin.embedding_model.model_id)
+                        or (v.model.model_id_or_path == indin.embedding_model.model_path)
+                    )
+                ):
+                    return v
+        return None
+
+
+class RetrieverMgr(BaseMgr):
+
+    def __init__(self):
+        super().__init__()
+
+
+class PostProcessorMgr(BaseMgr):
+
+    def __init__(self):
+        super().__init__()
+
+
+class GeneratorMgr(BaseMgr):
+
+    def __init__(self):
+        super().__init__()
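NodeParserMgr deduplicates parser components: specs are searched before anything new is created, which is exactly what update_pipeline_handler relies on. A sketch:

```python
from edgecraftrag.api_schema import NodeParserIn
from edgecraftrag.components.node_parser import SimpleNodeParser
from edgecraftrag.controllers.compmgr import NodeParserMgr

npmgr = NodeParserMgr()
spec = NodeParserIn(parser_type="simple", chunk_size=400, chunk_overlap=48)
if npmgr.search_parser(spec) is None:
    npmgr.add(SimpleNodeParser(chunk_size=spec.chunk_size, chunk_overlap=spec.chunk_overlap))

# an identical spec now resolves to the already-registered component
assert npmgr.search_parser(spec) is not None
```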
diff --git a/EdgeCraftRAG/edgecraftrag/controllers/filemgr.py b/EdgeCraftRAG/edgecraftrag/controllers/filemgr.py
new file mode 100644
index 0000000000..0278f1f6ac
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/controllers/filemgr.py
@@ -0,0 +1,83 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from typing import Any, List
+
+from edgecraftrag.base import BaseMgr
+from edgecraftrag.components.data import File
+from llama_index.core.schema import Document
+
+
+class FileMgr(BaseMgr):
+
+    def __init__(self):
+        super().__init__()
+
+    def add_text(self, text: str):
+        file = File(file_name="text", content=text)
+        self.add(file)
+        return file.documents
+
+    def add_files(self, docs: Any):
+        if not isinstance(docs, list):
+            docs = [docs]
+
+        input_docs = []
+        for doc in docs:
+            if not os.path.exists(doc):
+                continue
+
+            if os.path.isfile(doc):
+                files = [doc]
+            elif os.path.isdir(doc):
+                files = [os.path.join(root, f) for root, _, files in os.walk(doc) for f in files]
+            else:
+                continue
+
+            if not files:
+                continue
+
+            for file_path in files:
+                file = File(file_path=file_path)
+                self.add(file)
+                input_docs.extend(file.documents)
+
+        return input_docs
+
+    def get_file_by_name_or_id(self, name: str):
+        for _, file in self.components.items():
+            if file.name == name or file.idx == name:
+                return file
+        return None
+
+    def get_files(self):
+        return [file for _, file in self.components.items()]
+
+    def get_all_docs(self) -> List[Document]:
+        all_docs = []
+        for _, file in self.components.items():
+            all_docs.extend(file.documents)
+        return all_docs
+
+    def get_docs_by_file(self, name) -> List[Document]:
+        file = self.get_file_by_name_or_id(name)
+        return file.documents if file else []
+
+    def del_file(self, name):
+        file = self.get_file_by_name_or_id(name)
+        if file:
+            self.remove(file.idx)
+            return True
+        else:
+            return False
+
+    def update_file(self, name):
+        file = self.get_file_by_name_or_id(name)
+        if file:
+            self.remove(file.idx)
+            self.add_files(docs=name)
+            return True
+        else:
+            return False
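A short sketch of the file manager flow, assuming a local ./docs directory containing at least one supported file type (.pdf, .txt, .md, ...):

```python
from edgecraftrag.controllers.filemgr import FileMgr

fmgr = FileMgr()
docs = fmgr.add_files(docs="./docs")  # walks the directory and loads Documents
docs += fmgr.add_text(text="Inline text is wrapped in a virtual 'text' file.")
print(len(fmgr.get_all_docs()), "documents across", len(fmgr.get_files()), "files")
```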
diff --git a/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py
new file mode 100644
index 0000000000..73a77e48a8
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/controllers/modelmgr.py
@@ -0,0 +1,94 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+
+from edgecraftrag.api_schema import ModelIn
+from edgecraftrag.base import BaseComponent, BaseMgr, ModelType
+from edgecraftrag.components.model import OpenVINOEmbeddingModel, OpenVINOLLMModel, OpenVINORerankModel
+
+
+class ModelMgr(BaseMgr):
+
+    def __init__(self):
+        self._lock = asyncio.Lock()
+        super().__init__()
+
+    def get_model_by_name(self, name: str):
+        for _, v in self.components.items():
+            if v.model_id == name:
+                model_type = v.comp_subtype.value
+                model_info = {
+                    "model_type": model_type,
+                    "model_id": getattr(v, "model_id", "Unknown"),
+                }
+                if model_type == ModelType.LLM:
+                    model_info["model_path"] = getattr(v, "model_name", "Unknown")
+                    model_info["device"] = getattr(v, "device_map", "Unknown")
+                else:
+                    model_info["model_path"] = getattr(v, "model_id_or_path", "Unknown")
+                    model_info["device"] = getattr(v, "device", getattr(v, "_device", "Unknown"))
+                return model_info
+        return None
+
+    def get_models(self):
+        model = {}
+        for k, v in self.components.items():
+            # Collect the model's metadata for the response
+            model_type = v.comp_subtype.value
+            model_info = {
+                "model_type": model_type,
+                "model_id": getattr(v, "model_id", "Unknown"),
+            }
+            if model_type == ModelType.LLM:
+                model_info["model_path"] = getattr(v, "model_name", "Unknown")
+                model_info["device"] = getattr(v, "device_map", "Unknown")
+            else:
+                model_info["model_path"] = getattr(v, "model_id_or_path", "Unknown")
+                model_info["device"] = getattr(v, "device", getattr(v, "_device", "Unknown"))
+            model[k] = model_info
+        return model
+
+    def search_model(self, modelin: ModelIn) -> BaseComponent:
+        # Search for a model by comparing model_path and device
+        for _, v in self.components.items():
+            model_path = v.model_name if v.comp_subtype.value == "llm" else v.model_id_or_path
+            model_dev = (
+                v.device_map
+                if v.comp_subtype.value == "llm"
+                else getattr(v, "device", getattr(v, "_device", "Unknown"))
+            )
+            if model_path == modelin.model_path and model_dev == modelin.device:
+                return v
+        return None
+
+    def del_model_by_name(self, name: str):
+        for key, v in self.components.items():
+            if v and v.model_id == name:
+                self.remove(key)
+                return "Model deleted"
+        return "Model not found"
+
+    @staticmethod
+    def load_model(model_para: ModelIn):
+        model = None
+        match model_para.model_type:
+            case ModelType.EMBEDDING:
+                model = OpenVINOEmbeddingModel(
+                    model_id=model_para.model_id,
+                    model_path=model_para.model_path,
+                    device=model_para.device,
+                )
+            case ModelType.RERANKER:
+                model = OpenVINORerankModel(
+                    model_id=model_para.model_id,
+                    model_path=model_para.model_path,
+                    device=model_para.device,
+                )
+            case ModelType.LLM:
+                model = OpenVINOLLMModel(
+                    model_id=model_para.model_id,
+                    model_path=model_para.model_path,
+                    device=model_para.device,
+                )
+        return model
diff --git a/EdgeCraftRAG/edgecraftrag/controllers/nodemgr.py b/EdgeCraftRAG/edgecraftrag/controllers/nodemgr.py
new file mode 100644
index 0000000000..13a41117c7
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/controllers/nodemgr.py
@@ -0,0 +1,34 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import List
+
+from llama_index.core.schema import BaseNode
+
+
+class NodeMgr:
+
+    def __init__(self):
+        self.nodes = {}
+
+    # np_idx: idx of the node_parser that produced the nodes
+    def add_nodes(self, np_idx, nodes):
+        if np_idx in self.nodes:
+            self.nodes[np_idx].extend(nodes)
+        else:
+            self.nodes[np_idx] = nodes
+
+    # TODO: to be implemented
+    def del_nodes(self, nodes):
+        pass
+
+    def del_nodes_by_np_idx(self, np_idx):
+        del self.nodes[np_idx]
+
+    def get_nodes(self, np_idx) -> List[BaseNode]:
+        if np_idx in self.nodes:
+            return self.nodes[np_idx]
+        else:
+            return []
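NodeMgr groups parsed nodes per node_parser idx, and a follow-up batch has to extend the stored list so that get_nodes() keeps returning a flat List[BaseNode]. A tiny illustration:

```python
from edgecraftrag.controllers.nodemgr import NodeMgr
from llama_index.core.schema import TextNode

nm = NodeMgr()
nm.add_nodes("parser-1", [TextNode(text="chunk 1"), TextNode(text="chunk 2")])
nm.add_nodes("parser-1", [TextNode(text="chunk 3")])
assert len(nm.get_nodes("parser-1")) == 3
```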
diff --git a/EdgeCraftRAG/edgecraftrag/controllers/pipelinemgr.py b/EdgeCraftRAG/edgecraftrag/controllers/pipelinemgr.py
new file mode 100644
index 0000000000..d0b8e07803
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/controllers/pipelinemgr.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+from typing import Any, List
+
+from comps.cores.proto.api_protocol import ChatCompletionRequest
+from edgecraftrag.base import BaseMgr, CallbackType
+from edgecraftrag.components.pipeline import Pipeline
+from edgecraftrag.controllers.nodemgr import NodeMgr
+from llama_index.core.schema import Document
+
+
+class PipelineMgr(BaseMgr):
+
+    def __init__(self):
+        self._active_pipeline = None
+        self._lock = asyncio.Lock()
+        super().__init__()
+
+    def create_pipeline(self, name: str):
+        pl = Pipeline(name)
+        self.add(pl)
+        return pl
+
+    def get_pipeline_by_name_or_id(self, name: str):
+        for _, pl in self.components.items():
+            if pl.name == name or pl.idx == name:
+                return pl
+        return None
+
+    def get_pipelines(self):
+        return [pl for _, pl in self.components.items()]
+
+    def activate_pipeline(self, name: str, active: bool, nm: NodeMgr):
+        pl = self.get_pipeline_by_name_or_id(name)
+        nodelist = None
+        if pl is not None:
+            if not active:
+                pl.status.active = False
+                self._active_pipeline = None
+                return
+            if pl.node_changed:
+                nodelist = nm.get_nodes(pl.node_parser.idx)
+            pl.check_active(nodelist)
+            prevactive = self._active_pipeline
+            if prevactive:
+                prevactive.status.active = False
+            pl.status.active = True
+            self._active_pipeline = pl
+
+    def get_active_pipeline(self) -> Pipeline:
+        return self._active_pipeline
+
+    def notify_node_change(self):
+        for _, pl in self.components.items():
+            pl.set_node_change()
+
+    def run_pipeline(self, chat_request: ChatCompletionRequest) -> Any:
+        ap = self.get_active_pipeline()
+        if ap is not None:
+            return ap.run(cbtype=CallbackType.PIPELINE, chat_request=chat_request)
+        return -1
+
+    def run_retrieve(self, chat_request: ChatCompletionRequest) -> Any:
+        ap = self.get_active_pipeline()
+        if ap is not None:
+            return ap.run(cbtype=CallbackType.RETRIEVE, chat_request=chat_request)
+        return -1
+
+    def run_data_prepare(self, docs: List[Document]) -> Any:
+        ap = self.get_active_pipeline()
+        if ap is not None:
+            return ap.run(cbtype=CallbackType.DATAPREP, docs=docs)
+        return -1
diff --git a/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt
new file mode 100644
index 0000000000..800d1fa2f2
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/prompt_template/default_prompt.txt
@@ -0,0 +1,8 @@
+<|im_start|>System: You are an AI assistant. Your task is to learn from the following context. Then answer the user's question based on what you learned from the context but not your own knowledge.<|im_end|>
+
+<|im_start|>{context}<|im_end|>
+
+<|im_start|>System: Pay attention to your formatting of response. If you need to reference content from context, try to keep the formatting.<|im_end|>
+<|im_start|>System: Try to summarize from the context, do some reasoning before response, then response. Make sure your response is logically sound and self-consistent.<|im_end|>
+
+<|im_start|>{input}
diff --git a/EdgeCraftRAG/edgecraftrag/requirements.txt b/EdgeCraftRAG/edgecraftrag/requirements.txt
new file mode 100644
index 0000000000..3756c732ad
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/requirements.txt
@@ -0,0 +1,16 @@
+docx2txt
+faiss-cpu>=1.8.0.post1
+gradio>=4.44.1
+langchain-core==0.2.29
+llama-index>=0.11.0
+llama-index-embeddings-openvino>=0.4.0
+llama-index-llms-openai-like>=0.2.0
+llama-index-llms-openvino>=0.3.1
+llama-index-postprocessor-openvino-rerank>=0.3.0
+llama-index-retrievers-bm25>=0.3.0
+llama-index-vector-stores-faiss>=0.2.1
+loguru>=0.7.2
+omegaconf>=2.3.0
+opea-comps>=0.9
+py-cpuinfo>=9.0.0
+uvicorn>=0.30.6
diff --git a/EdgeCraftRAG/edgecraftrag/server.py b/EdgeCraftRAG/edgecraftrag/server.py
new file mode 100644
index 0000000000..705c3f07ba
--- /dev/null
+++ b/EdgeCraftRAG/edgecraftrag/server.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+import uvicorn
+from edgecraftrag.api.v1.chatqna import chatqna_app
+from edgecraftrag.api.v1.data import data_app
+from edgecraftrag.api.v1.model import model_app
+from edgecraftrag.api.v1.pipeline import pipeline_app
+from fastapi import FastAPI
+from llama_index.core.settings import Settings
+
+app = FastAPI()
+
+sub_apps = [data_app, model_app, pipeline_app, chatqna_app]
+for sub_app in sub_apps:
+    for route in sub_app.routes:
+        app.router.routes.append(route)
+
+
+if __name__ == "__main__":
+    Settings.llm = None
+
+    host = os.getenv("PIPELINE_SERVICE_HOST_IP", "0.0.0.0")
+    port = int(os.getenv("PIPELINE_SERVICE_PORT", 16010))
+    uvicorn.run(app, host=host, port=port)
diff --git a/EdgeCraftRAG/tests/test_pipeline_local_llm.json b/EdgeCraftRAG/tests/test_pipeline_local_llm.json
new file mode 100644
index 0000000000..18895d6e50
--- /dev/null
+++ b/EdgeCraftRAG/tests/test_pipeline_local_llm.json
@@ -0,0 +1,41 @@
+{
+    "name": "rag_test_local_llm",
+    "node_parser": {
+        "chunk_size": 400,
+        "chunk_overlap": 48,
+        "parser_type": "simple"
+    },
+    "indexer": {
+        "indexer_type": "faiss_vector",
+        "embedding_model": {
+            "model_id": "BAAI/bge-small-en-v1.5",
+            "model_path": 
"./models/bge_ov_embedding", + "device": "auto" + } + }, + "retriever": { + "retriever_type": "vectorsimilarity", + "retrieve_topk": 30 + }, + "postprocessor": [ + { + "processor_type": "reranker", + "top_n": 2, + "reranker_model": { + "model_id": "BAAI/bge-reranker-large", + "model_path": "./models/bge_ov_reranker", + "device": "auto" + } + } + ], + "generator": { + "model": { + "model_id": "Qwen/Qwen2-7B-Instruct", + "model_path": "./models/qwen2-7b-instruct/INT4_compressed_weights", + "device": "cpu" + }, + "prompt_path": "./edgecraftrag/prompt_template/default_prompt.txt", + "inference_type": "local" + }, + "active": "True" +} diff --git a/EdgeCraftRAG/ui/docker/Dockerfile.ui b/EdgeCraftRAG/ui/docker/Dockerfile.ui new file mode 100644 index 0000000000..46a14a6e94 --- /dev/null +++ b/EdgeCraftRAG/ui/docker/Dockerfile.ui @@ -0,0 +1,23 @@ +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY ./ui/gradio /home/user/ui +COPY ./edgecraftrag /home/user/edgecraftrag + +WORKDIR /home/user/edgecraftrag +RUN pip install --no-cache-dir -r requirements.txt + +WORKDIR /home/user/ui + +USER user + +RUN echo 'ulimit -S -n 999999' >> ~/.bashrc + +ENTRYPOINT ["python", "ecragui.py"] diff --git a/EdgeCraftRAG/ui/gradio/__init__.py b/EdgeCraftRAG/ui/gradio/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/EdgeCraftRAG/ui/gradio/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/EdgeCraftRAG/ui/gradio/assets/ai-logo-inline-ondark-3000.png b/EdgeCraftRAG/ui/gradio/assets/ai-logo-inline-ondark-3000.png new file mode 100644 index 0000000000..527b9ad94c Binary files /dev/null and b/EdgeCraftRAG/ui/gradio/assets/ai-logo-inline-ondark-3000.png differ diff --git a/EdgeCraftRAG/ui/gradio/assets/ai-logo-inline-onlight-3000.png b/EdgeCraftRAG/ui/gradio/assets/ai-logo-inline-onlight-3000.png new file mode 100644 index 0000000000..707ddd251b Binary files /dev/null and b/EdgeCraftRAG/ui/gradio/assets/ai-logo-inline-onlight-3000.png differ diff --git a/EdgeCraftRAG/ui/gradio/config.py b/EdgeCraftRAG/ui/gradio/config.py new file mode 100644 index 0000000000..477aba7c24 --- /dev/null +++ b/EdgeCraftRAG/ui/gradio/config.py @@ -0,0 +1,358 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +DEFAULT_SYSTEM_PROMPT = """\ +You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. +If a question does not make any sense or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.\ +""" + +DEFAULT_SYSTEM_PROMPT_CHINESE = """\ +你是一个乐于助人、尊重他人以及诚实可靠的助手。在安全的情况下,始终尽可能有帮助地回答。 您的回答不应包含任何有害、不道德、种族主义、性别歧视、有毒、危险或非法的内容。请确保您的回答在社会上是公正的和积极的。 +如果一个问题没有任何意义或与事实不符,请解释原因,而不是回答错误的问题。如果您不知道问题的答案,请不要分享虚假信息。另外,答案请使用中文。\ +""" + +DEFAULT_SYSTEM_PROMPT_JAPANESE = """\ +あなたは親切で、礼儀正しく、誠実なアシスタントです。 常に安全を保ちながら、できるだけ役立つように答えてください。 回答には、有害、非倫理的、人種差別的、性差別的、有毒、危険、または違法なコンテンツを含めてはいけません。 回答は社会的に偏見がなく、本質的に前向きなものであることを確認してください。 +質問が意味をなさない場合、または事実に一貫性がない場合は、正しくないことに答えるのではなく、その理由を説明してください。 質問の答えがわからない場合は、誤った情報を共有しないでください。\ +""" + +DEFAULT_RAG_PROMPT = """\ +You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\ +""" + +DEFAULT_RAG_PROMPT_CHINESE = """\ +基于以下已知信息,请简洁并专业地回答用户的问题。如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"。不允许在答案中添加编造成分。另外,答案请使用中文。\ +""" + + +def red_pijama_partial_text_processor(partial_text, new_text): + if new_text == "<": + return partial_text + + partial_text += new_text + return partial_text.split(":")[-1] + + +def llama_partial_text_processor(partial_text, new_text): + new_text = new_text.replace("[INST]", "").replace("[/INST]", "") + partial_text += new_text + return partial_text + + +def chatglm_partial_text_processor(partial_text, new_text): + new_text = new_text.strip() + new_text = new_text.replace("[[训练时间]]", "2023年") + partial_text += new_text + return partial_text + + +def youri_partial_text_processor(partial_text, new_text): + new_text = new_text.replace("システム:", "") + partial_text += new_text + return partial_text + + +def internlm_partial_text_processor(partial_text, new_text): + partial_text += new_text + return partial_text.split("<|im_end|>")[0] + + +SUPPORTED_LLM_MODELS = { + "English": { + "tiny-llama-1b-chat": { + "model_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "remote_code": False, + "start_message": f"<|system|>\n{DEFAULT_SYSTEM_PROMPT}\n", + "history_template": "<|user|>\n{user} \n<|assistant|>\n{assistant} \n", + "current_message_template": "<|user|>\n{user} \n<|assistant|>\n{assistant}", + "rag_prompt_template": f"""<|system|> {DEFAULT_RAG_PROMPT }""" + + """ + <|user|> + Question: {input} + Context: {context} + Answer: + <|assistant|>""", + }, + "gemma-2b-it": { + "model_id": "google/gemma-2b-it", + "remote_code": False, + "start_message": DEFAULT_SYSTEM_PROMPT + ", ", + "history_template": "user{user}model{assistant}", + "current_message_template": "user{user}model{assistant}", + "rag_prompt_template": f"""{DEFAULT_RAG_PROMPT},""" + + """user{input}context{context}model""", + }, + "red-pajama-3b-chat": { + "model_id": "togethercomputer/RedPajama-INCITE-Chat-3B-v1", + "remote_code": False, + "start_message": "", + "history_template": "\n:{user}\n:{assistant}", + "stop_tokens": [29, 0], + "partial_text_processor": red_pijama_partial_text_processor, + "current_message_template": "\n:{user}\n:{assistant}", + "rag_prompt_template": f"""{DEFAULT_RAG_PROMPT }""" + + """ + : Question: {input} + Context: {context} + Answer: """, + }, + "gemma-7b-it": { + "model_id": "google/gemma-7b-it", + "remote_code": False, + "start_message": DEFAULT_SYSTEM_PROMPT + ", ", + "history_template": "user{user}model{assistant}", + "current_message_template": "user{user}model{assistant}", + "rag_prompt_template": f"""{DEFAULT_RAG_PROMPT},""" + + """user{input}context{context}model""", + }, + "llama-2-chat-7b": { + 
"model_id": "meta-llama/Llama-2-7b-chat-hf", + "remote_code": False, + "start_message": f"[INST] <>\n{DEFAULT_SYSTEM_PROMPT }\n<>\n\n", + "history_template": "{user}[/INST]{assistant}[INST]", + "current_message_template": "{user} [/INST]{assistant}", + "tokenizer_kwargs": {"add_special_tokens": False}, + "partial_text_processor": llama_partial_text_processor, + "rag_prompt_template": f"""[INST]Human: <> {DEFAULT_RAG_PROMPT }<>""" + + """ + Question: {input} + Context: {context} + Answer: [/INST]""", + }, + "mpt-7b-chat": { + "model_id": "mosaicml/mpt-7b-chat", + "remote_code": False, + "start_message": f"<|im_start|>system\n {DEFAULT_SYSTEM_PROMPT }<|im_end|>", + "history_template": "<|im_start|>user\n{user}<|im_start|>assistant\n{assistant}<|im_end|>", + "current_message_template": '"<|im_start|>user\n{user}<|im_start|>assistant\n{assistant}', + "stop_tokens": ["<|im_end|>", "<|endoftext|>"], + "rag_prompt_template": f"""<|im_start|>system + {DEFAULT_RAG_PROMPT }<|im_end|>""" + + """ + <|im_start|>user + Question: {input} + Context: {context} + Answer: <|im_start|>assistant""", + }, + "mistral-7b": { + "model_id": "mistralai/Mistral-7B-v0.1", + "remote_code": False, + "start_message": f"[INST] <>\n{DEFAULT_SYSTEM_PROMPT }\n<>\n\n", + "history_template": "{user}[/INST]{assistant}[INST]", + "current_message_template": "{user} [/INST]{assistant}", + "tokenizer_kwargs": {"add_special_tokens": False}, + "partial_text_processor": llama_partial_text_processor, + "rag_prompt_template": f""" [INST] {DEFAULT_RAG_PROMPT } [/INST] """ + + """ + [INST] Question: {input} + Context: {context} + Answer: [/INST]""", + }, + "zephyr-7b-beta": { + "model_id": "HuggingFaceH4/zephyr-7b-beta", + "remote_code": False, + "start_message": f"<|system|>\n{DEFAULT_SYSTEM_PROMPT}\n", + "history_template": "<|user|>\n{user} \n<|assistant|>\n{assistant} \n", + "current_message_template": "<|user|>\n{user} \n<|assistant|>\n{assistant}", + "rag_prompt_template": f"""<|system|> {DEFAULT_RAG_PROMPT }""" + + """ + <|user|> + Question: {input} + Context: {context} + Answer: + <|assistant|>""", + }, + "notus-7b-v1": { + "model_id": "argilla/notus-7b-v1", + "remote_code": False, + "start_message": f"<|system|>\n{DEFAULT_SYSTEM_PROMPT}\n", + "history_template": "<|user|>\n{user} \n<|assistant|>\n{assistant} \n", + "current_message_template": "<|user|>\n{user} \n<|assistant|>\n{assistant}", + "rag_prompt_template": f"""<|system|> {DEFAULT_RAG_PROMPT }""" + + """ + <|user|> + Question: {input} + Context: {context} + Answer: + <|assistant|>""", + }, + "neural-chat-7b-v3-1": { + "model_id": "Intel/neural-chat-7b-v3-3", + "remote_code": False, + "start_message": f"[INST] <>\n{DEFAULT_SYSTEM_PROMPT }\n<>\n\n", + "history_template": "{user}[/INST]{assistant}[INST]", + "current_message_template": "{user} [/INST]{assistant}", + "tokenizer_kwargs": {"add_special_tokens": False}, + "partial_text_processor": llama_partial_text_processor, + "rag_prompt_template": f""" [INST] {DEFAULT_RAG_PROMPT } [/INST] """ + + """ + [INST] Question: {input} + Context: {context} + Answer: [/INST]""", + }, + }, + "Chinese": { + "qwen1.5-0.5b-chat": { + "model_id": "Qwen/Qwen1.5-0.5B-Chat", + "remote_code": False, + "start_message": DEFAULT_SYSTEM_PROMPT_CHINESE, + "stop_tokens": ["<|im_end|>", "<|endoftext|>"], + }, + "qwen1.5-7b-chat": { + "model_id": "Qwen/Qwen1.5-7B-Chat", + "remote_code": False, + "start_message": DEFAULT_SYSTEM_PROMPT_CHINESE, + "stop_tokens": ["<|im_end|>", "<|endoftext|>"], + "summarization_prompt_template": """ + <|im_start|>user + 
问题: 总结下文内容,不少于{character_num}字. + 已知内容: {text} + 回答: <|im_end|><|im_start|>assistant""", + "split_summary_template": """ + <|im_start|>user + 问题: 根据已知内容写一篇简短的摘要. + 已知内容: {text} + 回答: <|im_end|><|im_start|>assistant""", + "combine_summary_template": """ + <|im_start|>user + 问题: 根据已知内容写一篇摘要,不少于{character_num}字. + 已知内容: {text} + 回答: <|im_end|><|im_start|>assistant""", + "rag_prompt_template": f"""<|im_start|>system + {DEFAULT_RAG_PROMPT_CHINESE }<|im_end|>""" + + """ + <|im_start|>user + 问题: {input} + 已知内容: {context} + 回答: <|im_end|><|im_start|>assistant""", + }, + "qwen-7b-chat": { + "model_id": "Qwen/Qwen-7B-Chat", + "remote_code": True, + "start_message": f"<|im_start|>system\n {DEFAULT_SYSTEM_PROMPT_CHINESE }<|im_end|>", + "history_template": "<|im_start|>user\n{user}<|im_start|>assistant\n{assistant}<|im_end|>", + "current_message_template": '"<|im_start|>user\n{user}<|im_start|>assistant\n{assistant}', + "stop_tokens": ["<|im_end|>", "<|endoftext|>"], + "revision": "2abd8e5777bb4ce9c8ab4be7dbbd0fe4526db78d", + "rag_prompt_template": f"""<|im_start|>system + {DEFAULT_RAG_PROMPT_CHINESE }<|im_end|>""" + + """ + <|im_start|>user + 问题: {input} + 已知内容: {context} + 回答: <|im_end|><|im_start|>assistant""", + }, + "qwen2-7b-instruct": { + "model_id": "Qwen/Qwen2-7B-Instruct", + "remote_code": True, + "start_message": f"<|im_start|>system\n {DEFAULT_SYSTEM_PROMPT_CHINESE }<|im_end|>", + "history_template": "<|im_start|>user\n{user}<|im_start|>assistant\n{assistant}<|im_end|>", + "current_message_template": '"<|im_start|>user\n{user}<|im_start|>assistant\n{assistant}', + "stop_tokens": ["<|im_end|>", "<|endoftext|>"], + "revision": "2abd8e5777bb4ce9c8ab4be7dbbd0fe4526db78d", + "rag_prompt_template": f"""<|im_start|>system + {DEFAULT_RAG_PROMPT_CHINESE }<|im_end|>""" + + """ + <|im_start|>user + 问题: {input} + 已知内容: {context} + 回答: <|im_end|><|im_start|>assistant""", + }, + "chatglm3-6b": { + "model_id": "THUDM/chatglm3-6b", + "remote_code": True, + "start_message": DEFAULT_SYSTEM_PROMPT_CHINESE, + "tokenizer_kwargs": {"add_special_tokens": False}, + "stop_tokens": [0, 2], + "rag_prompt_template": f"""{DEFAULT_RAG_PROMPT_CHINESE }""" + + """ + 问题: {input} + 已知内容: {context} + 回答: + """, + }, + "baichuan2-7b-chat": { + "model_id": "baichuan-inc/Baichuan2-7B-Chat", + "remote_code": True, + "start_message": DEFAULT_SYSTEM_PROMPT_CHINESE, + "tokenizer_kwargs": {"add_special_tokens": False}, + "stop_tokens": [0, 2], + "rag_prompt_template": f"""{DEFAULT_RAG_PROMPT_CHINESE }""" + + """ + 问题: {input} + 已知内容: {context} + 回答: + """, + }, + "minicpm-2b-dpo": { + "model_id": "openbmb/MiniCPM-2B-dpo-fp16", + "remote_code": True, + "start_message": DEFAULT_SYSTEM_PROMPT_CHINESE, + "stop_tokens": [2], + }, + "internlm2-chat-1.8b": { + "model_id": "internlm/internlm2-chat-1_8b", + "remote_code": True, + "start_message": DEFAULT_SYSTEM_PROMPT_CHINESE, + "stop_tokens": [2, 92542], + "partial_text_processor": internlm_partial_text_processor, + }, + "qwen1.5-1.8b-chat": { + "model_id": "Qwen/Qwen1.5-1.8B-Chat", + "remote_code": False, + "start_message": DEFAULT_SYSTEM_PROMPT_CHINESE, + "stop_tokens": ["<|im_end|>", "<|endoftext|>"], + "rag_prompt_template": f"""<|im_start|>system + {DEFAULT_RAG_PROMPT_CHINESE }<|im_end|>""" + + """ + <|im_start|>user + 问题: {input} + 已知内容: {context} + 回答: <|im_end|><|im_start|>assistant""", + }, + }, + "Japanese": { + "youri-7b-chat": { + "model_id": "rinna/youri-7b-chat", + "remote_code": False, + "start_message": f"設定: {DEFAULT_SYSTEM_PROMPT_JAPANESE}\n", + "history_template": "ユーザー: 
{user}\nシステム: {assistant}\n",
+            "current_message_template": "ユーザー: {user}\nシステム: {assistant}",
+            "tokenizer_kwargs": {"add_special_tokens": False},
+            "partial_text_processor": youri_partial_text_processor,
+        },
+    },
+}
+
+SUPPORTED_EMBEDDING_MODELS = {
+    "English": {
+        "bge-small-en-v1.5": {
+            "model_id": "BAAI/bge-small-en-v1.5",
+            "mean_pooling": False,
+            "normalize_embeddings": True,
+        },
+        "bge-large-en-v1.5": {
+            "model_id": "BAAI/bge-large-en-v1.5",
+            "mean_pooling": False,
+            "normalize_embeddings": True,
+        },
+    },
+    "Chinese": {
+        "bge-small-zh-v1.5": {
+            "model_id": "BAAI/bge-small-zh-v1.5",
+            "mean_pooling": False,
+            "normalize_embeddings": True,
+        },
+        "bge-large-zh-v1.5": {
+            "model_id": "BAAI/bge-large-zh-v1.5",
+            "mean_pooling": False,
+            "normalize_embeddings": True,
+        },
+    },
+}
+
+
+SUPPORTED_RERANK_MODELS = {
+    "bge-reranker-large": {"model_id": "BAAI/bge-reranker-large"},
+    "bge-reranker-base": {"model_id": "BAAI/bge-reranker-base"},
+}
diff --git a/EdgeCraftRAG/ui/gradio/default.yaml b/EdgeCraftRAG/ui/gradio/default.yaml
new file mode 100644
index 0000000000..1421da8f47
--- /dev/null
+++ b/EdgeCraftRAG/ui/gradio/default.yaml
@@ -0,0 +1,49 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Model language for LLM
+model_language: "Chinese"
+vector_db: "FAISS"
+splitter_name: "RecursiveCharacter"
+k_rerank: 5
+search_method: "similarity"
+score_threshold: 0.5
+bm25_weight: 0
+
+# Pipeline
+name: "default"
+
+# Node parser
+node_parser: "simple"
+chunk_size: 192
+chunk_overlap: 48
+
+# Indexer
+indexer: "faiss_vector"
+
+# Retriever
+retriever: "vectorsimilarity"
+k_retrieval: 30
+
+# Post Processor
+postprocessor: "reranker"
+
+# Generator
+generator: "local"
+prompt_path: "./data/default_prompt.txt"
+
+# Models
+embedding_model_id: "BAAI/bge-small-en-v1.5"
+embedding_model_path: "./bge_ov_embedding"
+# Device for embedding model inference
+embedding_device: "AUTO"
+
+rerank_model_id: "BAAI/bge-reranker-large"
+rerank_model_path: "./bge_ov_reranker"
+# Device for reranking model inference
+rerank_device: "AUTO"
+
+llm_model_id: "qwen2-7b-instruct"
+llm_model_path: "./qwen2-7b-instruct/INT4_compressed_weights"
+# Device for LLM model inference
+llm_device: "AUTO"
diff --git a/EdgeCraftRAG/ui/gradio/ecrag_client.py b/EdgeCraftRAG/ui/gradio/ecrag_client.py
new file mode 100644
index 0000000000..47b5f776d7
--- /dev/null
+++ b/EdgeCraftRAG/ui/gradio/ecrag_client.py
@@ -0,0 +1,124 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+
+import requests
+
+sys.path.append("..")
+import os
+
+from edgecraftrag import api_schema
+
+PIPELINE_SERVICE_HOST_IP = os.getenv("PIPELINE_SERVICE_HOST_IP", "127.0.0.1")
+PIPELINE_SERVICE_PORT = int(os.getenv("PIPELINE_SERVICE_PORT", 16010))
+server_addr = f"http://{PIPELINE_SERVICE_HOST_IP}:{PIPELINE_SERVICE_PORT}"
+
+
+def get_current_pipelines():
+    res = requests.get(f"{server_addr}/v1/settings/pipelines", proxies={"http": None})
+    pls = []
+    for pl in res.json():
+        if pl["status"]["active"]:
+            pls.append((pl["idx"], pl["name"] + " (active)"))
+        else:
+            pls.append((pl["idx"], pl["name"]))
+    return pls
+
+
+def get_pipeline(name):
+    res = requests.get(f"{server_addr}/v1/settings/pipelines/{name}", proxies={"http": None})
+    return res.json()
+
+
+def create_update_pipeline(
+    name,
+    active,
+    node_parser,
+    chunk_size,
+    chunk_overlap,
+    indexer,
+    retriever,
+    vector_search_top_k,
+    postprocessor,
+    generator,
+    llm_id,
+    llm_device,
+    llm_weights,
+    embedding_id,
+
embedding_device, + rerank_id, + rerank_device, +): + req_dict = api_schema.PipelineCreateIn( + name=name, + active=active, + node_parser=api_schema.NodeParserIn( + parser_type=node_parser, chunk_size=chunk_size, chunk_overlap=chunk_overlap + ), + indexer=api_schema.IndexerIn( + indexer_type=indexer, + embedding_model=api_schema.ModelIn( + model_id=embedding_id, + # TODO: remove hardcoding + model_path="./bge_ov_embedding", + device=embedding_device, + ), + ), + retriever=api_schema.RetrieverIn(retriever_type=retriever, retriever_topk=vector_search_top_k), + postprocessor=[ + api_schema.PostProcessorIn( + processor_type=postprocessor[0], + reranker_model=api_schema.ModelIn( + model_id=rerank_id, + # TODO: remove hardcoding + model_path="./bge_ov_reranker", + device=rerank_device, + ), + ) + ], + generator=api_schema.GeneratorIn( + # TODO: remove hardcoding + prompt_path="./edgecraftrag/prompt_template/default_prompt.txt", + model=api_schema.ModelIn( + model_id=llm_id, + # TODO: remove hardcoding + model_path="./models/qwen2-7b-instruct/INT4_compressed_weights", + device=llm_device, + ), + ), + ) + # hard code only for test + print(req_dict) + res = requests.post(f"{server_addr}/v1/settings/pipelines", json=req_dict.dict(), proxies={"http": None}) + return res.text + + +def activate_pipeline(name): + active_dict = {"active": "True"} + res = requests.patch(f"{server_addr}/v1/settings/pipelines/{name}", json=active_dict, proxies={"http": None}) + status = False + restext = f"Activate pipeline {name} failed." + if res.ok: + status = True + restext = f"Activate pipeline {name} successfully." + return restext, status + + +def create_vectordb(docs, spliter, vector_db): + req_dict = api_schema.FilesIn(local_paths=docs) + res = requests.post(f"{server_addr}/v1/data/files", json=req_dict.dict(), proxies={"http": None}) + return res.text + + +def get_files(): + res = requests.get(f"{server_addr}/v1/data/files", proxies={"http": None}) + files = [] + for file in res.json(): + files.append((file["file_name"], file["file_id"])) + return files + + +def delete_file(file_name_or_id): + res = requests.delete(f"{server_addr}/v1/data/files/{file_name_or_id}", proxies={"http": None}) + return res.text diff --git a/EdgeCraftRAG/ui/gradio/ecragui.py b/EdgeCraftRAG/ui/gradio/ecragui.py new file mode 100644 index 0000000000..3c198bf2a9 --- /dev/null +++ b/EdgeCraftRAG/ui/gradio/ecragui.py @@ -0,0 +1,983 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import json +import platform +import re +from datetime import datetime +from pathlib import Path + +import cpuinfo +import distro # if running Python 3.8 or above +import ecrag_client as cli +import gradio as gr +import httpx + +# Creation of the ModelLoader instance and loading models remain the same +import platform_config as pconf +import psutil +import requests +from loguru import logger +from omegaconf import OmegaConf +from platform_config import get_available_devices, get_available_weights, get_local_available_models + +pipeline_df = [] + +import os + +MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "127.0.0.1") +MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 16011)) +UI_SERVICE_HOST_IP = os.getenv("UI_SERVICE_HOST_IP", "0.0.0.0") +UI_SERVICE_PORT = int(os.getenv("UI_SERVICE_PORT", 8084)) + + +def get_llm_model_dir(llm_model_id, weights_compression): + model_dirs = { + "fp16_model_dir": Path(llm_model_id) / "FP16", + "int8_model_dir": Path(llm_model_id) / "INT8_compressed_weights", + 
"int4_model_dir": Path(llm_model_id) / "INT4_compressed_weights", + } + + if weights_compression == "INT4": + model_dir = model_dirs["int4_model_dir"] + elif weights_compression == "INT8": + model_dir = model_dirs["int8_model_dir"] + else: + model_dir = model_dirs["fp16_model_dir"] + + if not model_dir.exists(): + raise FileNotFoundError(f"The model directory {model_dir} does not exist.") + elif not model_dir.is_dir(): + raise NotADirectoryError(f"The path {model_dir} is not a directory.") + + return model_dir + + +def get_system_status(): + cpu_usage = psutil.cpu_percent(interval=1) + memory_info = psutil.virtual_memory() + memory_usage = memory_info.percent + memory_total_gb = memory_info.total / (1024**3) + memory_used_gb = memory_info.used / (1024**3) + # uptime_seconds = time.time() - psutil.boot_time() + # uptime_hours, uptime_minutes = divmod(uptime_seconds // 60, 60) + disk_usage = psutil.disk_usage("/").percent + # net_io = psutil.net_io_counters() + os_info = platform.uname() + kernel_version = os_info.release + processor = cpuinfo.get_cpu_info()["brand_raw"] + dist_name = distro.name(pretty=True) + + now = datetime.now() + current_time_str = now.strftime("%Y-%m-%d %H:%M") + + status = ( + f"{current_time_str} \t" + f"CPU Usage: {cpu_usage}% \t" + f"Memory Usage: {memory_usage}% {memory_used_gb:.2f}GB / {memory_total_gb:.2f}GB \t" + # f"System Uptime: {int(uptime_hours)} hours, {int(uptime_minutes)} minutes \t" + f"Disk Usage: {disk_usage}% \t" + # f"Bytes Sent: {net_io.bytes_sent}\n" + # f"Bytes Received: {net_io.bytes_recv}\n" + f"Kernel: {kernel_version} \t" + f"Processor: {processor} \t" + f"OS: {dist_name} \n" + ) + return status + + +def build_demo(cfg, args): + + def load_chatbot_models( + llm_id, + llm_device, + llm_weights, + embedding_id, + embedding_device, + rerank_id, + rerank_device, + ): + req_dict = { + "llm_id": llm_id, + "llm_device": llm_device, + "llm_weights": llm_weights, + "embedding_id": embedding_id, + "embedding_device": embedding_device, + "rerank_id": rerank_id, + "rerank_device": rerank_device, + } + # hard code only for test + worker_addr = "http://127.0.0.1:8084" + print(req_dict) + result = requests.post(f"{worker_addr}/load", json=req_dict, proxies={"http": None}) + return result.text + + def user(message, history): + """Callback function for updating user messages in interface on submit button click. + + Params: + message: current message + history: conversation history + Returns: + None + """ + # Append the user's message to the conversation history + return "", history + [[message, ""]] + + async def bot( + history, + temperature, + top_p, + top_k, + repetition_penalty, + hide_full_prompt, + do_rag, + docs, + spliter_name, + vector_db, + chunk_size, + chunk_overlap, + vector_search_top_k, + vector_search_top_n, + run_rerank, + search_method, + score_threshold, + ): + """Callback function for running chatbot on submit button click. + + Params: + history: conversation history + temperature: parameter for control the level of creativity in AI-generated text. + By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse. + top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability. + top_k: parameter for control the range of tokens considered by the AI model based on their cumulative probability, selecting number of tokens with highest probability. 
+            repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.
+        """
+        # req_dict = {
+        #     "history": history,
+        #     "temperature": temperature,
+        #     "top_p": top_p,
+        #     "top_k": top_k,
+        #     "repetition_penalty": repetition_penalty,
+        #     "hide_full_prompt": hide_full_prompt,
+        #     "do_rag": do_rag,
+        #     "docs": docs,
+        #     "spliter_name": spliter_name,
+        #     "vector_db": vector_db,
+        #     "chunk_size": chunk_size,
+        #     "chunk_overlap": chunk_overlap,
+        #     "vector_search_top_k": vector_search_top_k,
+        #     "vector_search_top_n": vector_search_top_n,
+        #     "run_rerank": run_rerank,
+        #     "search_method": search_method,
+        #     "score_threshold": score_threshold,
+        #     "streaming": True
+        # }
+        print(history)
+        new_req = {"messages": history[-1][0]}
+        server_addr = f"http://{MEGA_SERVICE_HOST_IP}:{MEGA_SERVICE_PORT}"
+
+        # Async for streaming response
+        partial_text = ""
+        async with httpx.AsyncClient() as client:
+            async with client.stream("POST", f"{server_addr}/v1/chatqna", json=new_req, timeout=None) as response:
+                partial_text = ""
+                async for chunk in response.aiter_lines():
+                    new_text = chunk
+                    if new_text.startswith("data"):
+                        # Strip the SSE "data: " prefix before parsing the JSON payload
+                        new_text = re.sub(r"\r\n", "", chunk.split("data: ")[-1])
+                        new_text = json.loads(new_text)["choices"][0]["message"]["content"]
+                        partial_text = partial_text + new_text
+                        history[-1][1] = partial_text
+                        yield history
+
+    avail_llms = get_local_available_models("llm")
+    avail_embed_models = get_local_available_models("embed")
+    avail_rerank_models = get_local_available_models("rerank")
+    avail_devices = get_available_devices()
+    avail_weights_compression = get_available_weights()
+    avail_node_parsers = pconf.get_available_node_parsers()
+    avail_indexers = pconf.get_available_indexers()
+    avail_retrievers = pconf.get_available_retrievers()
+    avail_postprocessors = pconf.get_available_postprocessors()
+    avail_generators = pconf.get_available_generators()
+
+    css = """
+    .feedback textarea {font-size: 18px !important}
+    #blude_border {border: 1px solid #0000FF}
+    #white_border {border: 2px solid #FFFFFF}
+    .test textarea {color: #E0E0FF; border: 1px solid #0000FF}
+    .disclaimer {font-variant-caps: all-small-caps}
+    """
+
+    with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+        gr.HTML(
+            """
+            <div style="text-align: center;">
+                <img src="assets/ai-logo-inline-onlight-3000.png" alt="Sample Image" style="max-width: 300px;">
+                <h1>Edge Craft RAG based Q&A Chatbot</h1>
+                <h5>Powered by Intel NEXC Edge AI solutions</h5>
+            </div>
+ + + + """ + ) + _ = gr.Textbox( + label="System Status", + value=get_system_status, + max_lines=1, + every=1, + info="", + elem_id="white_border", + ) + + def get_pipeline_df(): + global pipeline_df + pipeline_df = cli.get_current_pipelines() + return pipeline_df + + # ------------------- + # RAG Settings Layout + # ------------------- + with gr.Tab("RAG Settings"): + with gr.Row(): + with gr.Column(scale=2): + u_pipelines = gr.Dataframe( + headers=["ID", "Name"], + column_widths=[70, 30], + value=get_pipeline_df, + label="Pipelines", + show_label=True, + interactive=False, + every=5, + ) + + u_rag_pipeline_status = gr.Textbox(label="Status", value="", interactive=False) + + with gr.Column(scale=3): + with gr.Accordion("Pipeline Configuration"): + with gr.Row(): + rag_create_pipeline = gr.Button("Create Pipeline") + rag_activate_pipeline = gr.Button("Activate Pipeline") + rag_remove_pipeline = gr.Button("Remove Pipeline") + + with gr.Column(variant="panel"): + u_pipeline_name = gr.Textbox( + label="Name", + value=cfg.name, + interactive=True, + ) + u_active = gr.Checkbox( + value=True, + label="Activated", + interactive=True, + ) + + with gr.Column(variant="panel"): + with gr.Accordion("Node Parser"): + u_node_parser = gr.Dropdown( + choices=avail_node_parsers, + label="Node Parser", + value=cfg.node_parser, + info="Select a parser to split documents.", + multiselect=False, + interactive=True, + ) + u_chunk_size = gr.Slider( + label="Chunk size", + value=cfg.chunk_size, + minimum=100, + maximum=2000, + step=50, + interactive=True, + info="Size of sentence chunk", + ) + + u_chunk_overlap = gr.Slider( + label="Chunk overlap", + value=cfg.chunk_overlap, + minimum=0, + maximum=400, + step=1, + interactive=True, + info=("Overlap between 2 chunks"), + ) + + with gr.Column(variant="panel"): + with gr.Accordion("Indexer"): + u_indexer = gr.Dropdown( + choices=avail_indexers, + label="Indexer", + value=cfg.indexer, + info="Select an indexer for indexing content of the documents.", + multiselect=False, + interactive=True, + ) + + with gr.Accordion("Embedding Model Configuration"): + u_embed_model_id = gr.Dropdown( + choices=avail_embed_models, + value=cfg.embedding_model_id, + label="Embedding Model", + # info="Select a Embedding Model", + multiselect=False, + allow_custom_value=True, + ) + + u_embed_device = gr.Dropdown( + choices=avail_devices, + value=cfg.embedding_device, + label="Embedding run device", + # info="Run embedding model on which device?", + multiselect=False, + ) + + with gr.Column(variant="panel"): + with gr.Accordion("Retriever"): + u_retriever = gr.Dropdown( + choices=avail_retrievers, + value=cfg.retriever, + label="Retriever", + info="Select a retriever for retrieving context.", + multiselect=False, + interactive=True, + ) + u_vector_search_top_k = gr.Slider( + 1, + 50, + value=cfg.k_retrieval, + step=1, + label="Search top k", + info="Number of searching results, must >= Rerank top n", + interactive=True, + ) + + with gr.Column(variant="panel"): + with gr.Accordion("Postprocessor"): + u_postprocessor = gr.Dropdown( + choices=avail_postprocessors, + value=cfg.postprocessor, + label="Postprocessor", + info="Select postprocessors for post-processing of the context.", + multiselect=True, + interactive=True, + ) + + with gr.Accordion("Rerank Model Configuration", open=True): + u_rerank_model_id = gr.Dropdown( + choices=avail_rerank_models, + value=cfg.rerank_model_id, + label="Rerank Model", + # info="Select a Rerank Model", + multiselect=False, + allow_custom_value=True, + ) + + 
u_rerank_device = gr.Dropdown( + choices=avail_devices, + value=cfg.rerank_device, + label="Rerank run device", + # info="Run rerank model on which device?", + multiselect=False, + ) + + with gr.Column(variant="panel"): + with gr.Accordion("Generator"): + u_generator = gr.Dropdown( + choices=avail_generators, + value=cfg.generator, + label="Generator", + info="Select a generator for AI inference.", + multiselect=False, + interactive=True, + ) + + with gr.Accordion("LLM Configuration", open=True): + u_llm_model_id = gr.Dropdown( + choices=avail_llms, + value=cfg.llm_model_id, + label="Large Language Model", + # info="Select a Large Language Model", + multiselect=False, + allow_custom_value=True, + ) + + u_llm_device = gr.Dropdown( + choices=avail_devices, + value=cfg.llm_device, + label="LLM run device", + # info="Run LLM on which device?", + multiselect=False, + ) + + u_llm_weights = gr.Radio( + avail_weights_compression, + label="Weights", + info="weights compression", + ) + + # ------------------- + # RAG Settings Events + # ------------------- + # Event handlers + def show_pipeline_detail(evt: gr.SelectData): + # get selected pipeline id + # Dataframe: {'headers': '', 'data': [[x00, x01], [x10, x11]} + # SelectData.index: [i, j] + print(u_pipelines.value["data"]) + print(evt.index) + # always use pipeline id for indexing + selected_id = pipeline_df[evt.index[0]][0] + pl = cli.get_pipeline(selected_id) + # TODO: change to json fomart + # pl["postprocessor"][0]["processor_type"] + # pl["postprocessor"]["model"]["model_id"], pl["postprocessor"]["model"]["device"] + return ( + pl["name"], + pl["status"]["active"], + pl["node_parser"]["parser_type"], + pl["node_parser"]["chunk_size"], + pl["node_parser"]["chunk_overlap"], + pl["indexer"]["indexer_type"], + pl["retriever"]["retriever_type"], + pl["retriever"]["retrieve_topk"], + pl["generator"]["generator_type"], + pl["generator"]["model"]["model_id"], + pl["generator"]["model"]["device"], + "", + pl["indexer"]["model"]["model_id"], + pl["indexer"]["model"]["device"], + ) + + def modify_create_pipeline_button(): + return "Create Pipeline" + + def modify_update_pipeline_button(): + return "Update Pipeline" + + def create_update_pipeline( + name, + active, + node_parser, + chunk_size, + chunk_overlap, + indexer, + retriever, + vector_search_top_k, + postprocessor, + generator, + llm_id, + llm_device, + llm_weights, + embedding_id, + embedding_device, + rerank_id, + rerank_device, + ): + res = cli.create_update_pipeline( + name, + active, + node_parser, + chunk_size, + chunk_overlap, + indexer, + retriever, + vector_search_top_k, + postprocessor, + generator, + llm_id, + llm_device, + llm_weights, + embedding_id, + embedding_device, + rerank_id, + rerank_device, + ) + return res, get_pipeline_df() + + # Events + u_pipelines.select( + show_pipeline_detail, + inputs=None, + outputs=[ + u_pipeline_name, + u_active, + # node parser + u_node_parser, + u_chunk_size, + u_chunk_overlap, + # indexer + u_indexer, + # retriever + u_retriever, + u_vector_search_top_k, + # postprocessor + # u_postprocessor, + # generator + u_generator, + # models + u_llm_model_id, + u_llm_device, + u_llm_weights, + u_embed_model_id, + u_embed_device, + # u_rerank_model_id, + # u_rerank_device + ], + ) + + u_pipeline_name.input(modify_create_pipeline_button, inputs=None, outputs=rag_create_pipeline) + + # Create pipeline button will change to update pipeline button if any + # of the listed fields changed + gr.on( + triggers=[ + u_active.input, + # node parser + 
u_node_parser.input, + u_chunk_size.input, + u_chunk_overlap.input, + # indexer + u_indexer.input, + # retriever + u_retriever.input, + u_vector_search_top_k.input, + # postprocessor + u_postprocessor.input, + # generator + u_generator.input, + # models + u_llm_model_id.input, + u_llm_device.input, + u_llm_weights.input, + u_embed_model_id.input, + u_embed_device.input, + u_rerank_model_id.input, + u_rerank_device.input, + ], + fn=modify_update_pipeline_button, + inputs=None, + outputs=rag_create_pipeline, + ) + + rag_create_pipeline.click( + create_update_pipeline, + inputs=[ + u_pipeline_name, + u_active, + u_node_parser, + u_chunk_size, + u_chunk_overlap, + u_indexer, + u_retriever, + u_vector_search_top_k, + u_postprocessor, + u_generator, + u_llm_model_id, + u_llm_device, + u_llm_weights, + u_embed_model_id, + u_embed_device, + u_rerank_model_id, + u_rerank_device, + ], + outputs=[u_rag_pipeline_status, u_pipelines], + queue=False, + ) + + rag_activate_pipeline.click( + cli.activate_pipeline, + inputs=[u_pipeline_name], + outputs=[u_rag_pipeline_status, u_active], + queue=False, + ) + + # -------------- + # Chatbot Layout + # -------------- + def get_files(): + return cli.get_files() + + def create_vectordb(docs, spliter, vector_db): + res = cli.create_vectordb(docs, spliter, vector_db) + return gr.update(value=get_files()), res + + global u_files_selected_row + u_files_selected_row = None + + def select_file(data, evt: gr.SelectData): + if not evt.selected or len(evt.index) == 0: + return "No file selected" + global u_files_selected_row + row_index = evt.index[0] + u_files_selected_row = data.iloc[row_index] + file_name, file_id = u_files_selected_row + return f"File Name: {file_name}\nFile ID: {file_id}" + + def deselect_file(): + global u_files_selected_row + u_files_selected_row = None + return gr.update(value=get_files()), "Selection cleared" + + def delete_file(): + global u_files_selected_row + if u_files_selected_row is None: + res = "Please select a file first." 
+ else: + file_name, file_id = u_files_selected_row + u_files_selected_row = None + res = cli.delete_file(file_id) + return gr.update(value=get_files()), res + + with gr.Tab("Chatbot"): + with gr.Row(): + with gr.Column(scale=1): + docs = gr.File( + label="Step 1: Load text files", + file_count="multiple", + file_types=[ + ".csv", + ".doc", + ".docx", + ".enex", + ".epub", + ".html", + ".md", + ".odt", + ".pdf", + ".ppt", + ".pptx", + ".txt", + ], + ) + retriever_argument = gr.Accordion("Vector Store Configuration", open=False) + with retriever_argument: + spliter = gr.Dropdown( + ["Character", "RecursiveCharacter", "Markdown", "Chinese"], + value=cfg.splitter_name, + label="Text Spliter", + info="Method used to split the documents", + multiselect=False, + ) + + vector_db = gr.Dropdown( + ["FAISS", "Chroma"], + value=cfg.vector_db, + label="Vector Stores", + info="Stores embedded data and performs vector search.", + multiselect=False, + ) + load_docs = gr.Button("Upload files") + + u_files_status = gr.Textbox(label="File Processing Status", value="", interactive=False) + u_files = gr.Dataframe( + headers=["Loaded File Name", "File ID"], + value=get_files, + label="Loaded Files", + show_label=False, + interactive=False, + every=5, + ) + + with gr.Accordion("Delete File", open=False): + selected_files = gr.Textbox(label="Click file to select", value="", interactive=False) + with gr.Row(): + with gr.Column(): + delete_button = gr.Button("Delete Selected File") + with gr.Column(): + deselect_button = gr.Button("Clear Selection") + + do_rag = gr.Checkbox( + value=True, + label="RAG is ON", + interactive=True, + info="Whether to do RAG for generation", + ) + with gr.Accordion("Generation Configuration", open=False): + with gr.Row(): + with gr.Column(): + with gr.Row(): + temperature = gr.Slider( + label="Temperature", + value=0.1, + minimum=0.0, + maximum=1.0, + step=0.1, + interactive=True, + info="Higher values produce more diverse outputs", + ) + with gr.Column(): + with gr.Row(): + top_p = gr.Slider( + label="Top-p (nucleus sampling)", + value=1.0, + minimum=0.0, + maximum=1, + step=0.01, + interactive=True, + info=( + "Sample from the smallest possible set of tokens whose cumulative probability " + "exceeds top_p. Set to 1 to disable and sample from all tokens." 
+ ), + ) + with gr.Column(): + with gr.Row(): + top_k = gr.Slider( + label="Top-k", + value=50, + minimum=0.0, + maximum=200, + step=1, + interactive=True, + info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.", + ) + with gr.Column(): + with gr.Row(): + repetition_penalty = gr.Slider( + label="Repetition Penalty", + value=1.1, + minimum=1.0, + maximum=2.0, + step=0.1, + interactive=True, + info="Penalize repetition — 1.0 to disable.", + ) + with gr.Column(scale=4): + chatbot = gr.Chatbot( + height=600, + label="Step 2: Input Query", + show_copy_button=True, + ) + with gr.Row(): + with gr.Column(): + msg = gr.Textbox( + label="QA Message Box", + placeholder="Chat Message Box", + show_label=False, + container=False, + ) + with gr.Column(): + with gr.Row(): + submit = gr.Button("Submit") + stop = gr.Button("Stop") + clear = gr.Button("Clear") + retriever_argument = gr.Accordion("Retriever Configuration", open=True) + with retriever_argument: + with gr.Row(): + with gr.Row(): + do_rerank = gr.Checkbox( + value=True, + label="Rerank searching result", + interactive=True, + ) + hide_context = gr.Checkbox( + value=True, + label="Hide searching result in prompt", + interactive=True, + ) + with gr.Row(): + search_method = gr.Dropdown( + ["similarity_score_threshold", "similarity", "mmr"], + value=cfg.search_method, + label="Searching Method", + info="Method used to search vector store", + multiselect=False, + interactive=True, + ) + with gr.Row(): + score_threshold = gr.Slider( + 0.01, + 0.99, + value=cfg.score_threshold, + step=0.01, + label="Similarity Threshold", + info="Only working for 'similarity score threshold' method", + interactive=True, + ) + with gr.Row(): + vector_rerank_top_n = gr.Slider( + 1, + 10, + value=cfg.k_rerank, + step=1, + label="Rerank top n", + info="Number of rerank results", + interactive=True, + ) + load_docs.click( + create_vectordb, + inputs=[ + docs, + spliter, + vector_db, + ], + outputs=[u_files, u_files_status], + queue=True, + ) + # TODO: Need to de-select the dataframe, + # otherwise every time the dataframe is updated, a select event is triggered + u_files.select(select_file, inputs=[u_files], outputs=selected_files, queue=True) + + delete_button.click( + delete_file, + outputs=[u_files, u_files_status], + queue=True, + ) + deselect_button.click( + deselect_file, + outputs=[u_files, selected_files], + queue=True, + ) + + submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then( + bot, + [ + chatbot, + temperature, + top_p, + top_k, + repetition_penalty, + hide_context, + do_rag, + docs, + spliter, + vector_db, + u_chunk_size, + u_chunk_overlap, + u_vector_search_top_k, + vector_rerank_top_n, + do_rerank, + search_method, + score_threshold, + ], + chatbot, + queue=True, + ) + submit_click_event = submit.click(user, [msg, chatbot], [msg, chatbot], queue=False).then( + bot, + [ + chatbot, + temperature, + top_p, + top_k, + repetition_penalty, + hide_context, + do_rag, + docs, + spliter, + vector_db, + u_chunk_size, + u_chunk_overlap, + u_vector_search_top_k, + vector_rerank_top_n, + do_rerank, + search_method, + score_threshold, + ], + chatbot, + queue=True, + ) + # stop.click( + # fn=request_cancel, + # inputs=None, + # outputs=None, + # cancels=[submit_event, submit_click_event], + # queue=False, + # ) + clear.click(lambda: None, None, chatbot, queue=False) + return demo + + +def main(): + # Create the parser + parser = argparse.ArgumentParser(description="Load Embedding and LLM Models with 
OpenVino.") + # Add the arguments + parser.add_argument("--prompt_template", type=str, required=False, help="User specific template") + # parser.add_argument("--server_name", type=str, default="0.0.0.0") + # parser.add_argument("--server_port", type=int, default=8082) + parser.add_argument("--config", type=str, default="./default.yaml", help="configuration file path") + parser.add_argument("--share", action="store_true", help="share model") + parser.add_argument("--debug", action="store_true", help="enable debugging") + + # Execute the parse_args() method to collect command line arguments + args = parser.parse_args() + logger.info(args) + cfg = OmegaConf.load(args.config) + init_cfg_(cfg) + logger.info(cfg) + + demo = build_demo(cfg, args) + # if you are launching remotely, specify server_name and server_port + # demo.launch(server_name='your server name', server_port='server port in int') + # if you have any issue to launch on your platform, you can pass share=True to launch method: + # demo.launch(share=True) + # it creates a publicly shareable link for the interface. Read more in the docs: https://gradio.app/docs/ + # demo.launch(share=True) + demo.queue().launch( + server_name=UI_SERVICE_HOST_IP, server_port=UI_SERVICE_PORT, share=args.share, allowed_paths=["."] + ) + + # %% + # please run this cell for stopping gradio interface + demo.close() + + +def init_cfg_(cfg): + if "name" not in cfg: + cfg.name = "default" + if "embedding_device" not in cfg: + cfg.embedding_device = "CPU" + if "rerank_device" not in cfg: + cfg.rerank_device = "CPU" + if "llm_device" not in cfg: + cfg.llm_device = "CPU" + if "model_language" not in cfg: + cfg.model_language = "Chinese" + if "vector_db" not in cfg: + cfg.vector_db = "FAISS" + if "splitter_name" not in cfg: + cfg.splitter_name = "RecursiveCharacter" # or "Chinese" + if "search_method" not in cfg: + cfg.search_method = "similarity" + if "score_threshold" not in cfg: + cfg.score_threshold = 0.5 + + +if __name__ == "__main__": + main() diff --git a/EdgeCraftRAG/ui/gradio/platform_config.py b/EdgeCraftRAG/ui/gradio/platform_config.py new file mode 100644 index 0000000000..852409c1c0 --- /dev/null +++ b/EdgeCraftRAG/ui/gradio/platform_config.py @@ -0,0 +1,114 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import sys +from enum import Enum + +import openvino.runtime as ov +from config import SUPPORTED_EMBEDDING_MODELS, SUPPORTED_LLM_MODELS, SUPPORTED_RERANK_MODELS + +sys.path.append("..") +from edgecraftrag.base import GeneratorType, IndexerType, NodeParserType, PostProcessorType, RetrieverType + + +def _get_llm_model_ids(supported_models, model_language=None): + if model_language is None: + model_ids = [model_id for model_id, _ in supported_models.items()] + return model_ids + + if model_language not in supported_models: + print("Invalid model language! Please choose from the available options.") + return None + + # Create a list of model IDs based on the selected language + llm_model_ids = [ + model_id + for model_id, model_config in supported_models[model_language].items() + if model_config.get("rag_prompt_template") or model_config.get("normalize_embeddings") + ] + + return llm_model_ids + + +def _list_subdirectories(parent_directory): + """List all subdirectories under the given parent directory using os.listdir. + + Parameters: + parent_directory (str): The path to the parent directory from which to list subdirectories. + + Returns: + list: A list of subdirectory names found in the parent directory. 
+ """ + # Get a list of all entries in the parent directory + entries = os.listdir(parent_directory) + + # Filter out the entries to only keep directories + subdirectories = [entry for entry in entries if os.path.isdir(os.path.join(parent_directory, entry))] + + return sorted(subdirectories) + + +def _get_available_models(model_ids, local_dirs): + """Filters and sorts model IDs based on their presence in the local directories. + + Parameters: + model_ids (list): A list of model IDs to check. + local_dirs (list): A list of local directory names to check against. + + Returns: + list: A sorted list of available model IDs. + """ + # Filter model_ids for those that are present in local directories + return sorted([model_id for model_id in model_ids if model_id in local_dirs]) + + +def get_local_available_models(model_type: str, local_path: str = "./"): + local_dirs = _list_subdirectories(local_path) + if model_type == "llm": + model_ids = _get_llm_model_ids(SUPPORTED_LLM_MODELS, "Chinese") + elif model_type == "embed": + model_ids = _get_llm_model_ids(SUPPORTED_EMBEDDING_MODELS, "Chinese") + elif model_type == "rerank": + model_ids = _get_llm_model_ids(SUPPORTED_RERANK_MODELS) + else: + print("Unknown model type") + avail_models = _get_available_models(model_ids, local_dirs) + return avail_models + + +def get_available_devices(): + core = ov.Core() + avail_devices = core.available_devices + ["AUTO"] + if "NPU" in avail_devices: + avail_devices.remove("NPU") + return avail_devices + + +def get_available_weights(): + avail_weights_compression = ["FP16", "INT8", "INT4"] + return avail_weights_compression + + +def get_enum_values(c: Enum): + return [v.value for k, v in vars(c).items() if not callable(v) and not k.startswith("__") and not k.startswith("_")] + + +def get_available_node_parsers(): + return get_enum_values(NodeParserType) + + +def get_available_indexers(): + return get_enum_values(IndexerType) + + +def get_available_retrievers(): + return get_enum_values(RetrieverType) + + +def get_available_postprocessors(): + return get_enum_values(PostProcessorType) + + +def get_available_generators(): + return get_enum_values(GeneratorType) diff --git a/FaqGen/Dockerfile b/FaqGen/Dockerfile index 08307f0046..4018b44d1f 100644 --- a/FaqGen/Dockerfile +++ b/FaqGen/Dockerfile @@ -19,7 +19,7 @@ WORKDIR /home/user/ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ +RUN pip install --no-cache-dir --upgrade pip setuptools && \ pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt COPY ./faqgen.py /home/user/faqgen.py diff --git a/FaqGen/benchmark/accuracy/evaluate.py b/FaqGen/benchmark/accuracy/evaluate.py index 30998da4dd..da75502ce0 100644 --- a/FaqGen/benchmark/accuracy/evaluate.py +++ b/FaqGen/benchmark/accuracy/evaluate.py @@ -35,7 +35,7 @@ contexts.append([inputs_faq]) embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") -metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "reference_free_rubrics_score"] +metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "rubrics_score_without_reference"] metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq) test_case = {"question": question, "answer": answer, "ground_truth": ground_truth, "contexts": contexts} diff --git a/FaqGen/benchmark/accuracy/launch_tgi.sh b/FaqGen/benchmark/accuracy/launch_tgi.sh index f4ac9eade4..1a1d23ee8d 100644 --- 
a/FaqGen/benchmark/accuracy/launch_tgi.sh
+++ b/FaqGen/benchmark/accuracy/launch_tgi.sh
@@ -11,7 +11,6 @@ docker run -it --rm \
   -p $port_number:80 \
   -v $volume:/data \
   --runtime=habana \
-  --restart always \
   -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
   -e HABANA_VISIBLE_DEVICES=all \
   -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
@@ -20,7 +19,7 @@ docker run -it --rm \
   --ipc=host \
   -e HTTPS_PROXY=$https_proxy \
   -e HTTP_PROXY=$https_proxy \
-  ghcr.io/huggingface/tgi-gaudi:2.0.5 \
+  ghcr.io/huggingface/tgi-gaudi:2.0.6 \
   --model-id $model_name \
   --max-input-tokens $max_input_tokens \
   --max-total-tokens $max_total_tokens \
diff --git a/FaqGen/benchmark/performance/README.md b/FaqGen/benchmark/performance/README.md
new file mode 100644
index 0000000000..0587a85a1e
--- /dev/null
+++ b/FaqGen/benchmark/performance/README.md
@@ -0,0 +1,77 @@
+# FaqGen Benchmarking
+
+This folder contains a collection of scripts that enable inference benchmarking with [GenAIEval](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/README.md), a comprehensive benchmarking tool for throughput analysis of inference performance.
+
+By following this guide, you can run benchmarks on your deployment and share the results with the OPEA community.
+
+## Purpose
+
+We aim to run these benchmarks and share them with the OPEA community for three primary reasons:
+
+- To offer insights on inference throughput in real-world scenarios, helping you choose the best service or deployment for your needs.
+- To establish a baseline for validating optimization solutions across different implementations, providing clear guidance on which methods are most effective for your use case.
+- To inspire the community to build upon our benchmarks, allowing us to better quantify new solutions in conjunction with the current leading LLMs, serving frameworks, etc.
+
+## Metrics
+
+The benchmark reports the following metrics:
+
+- Number of Concurrent Requests
+- End-to-End Latency: P50, P90, P99 (in milliseconds)
+- End-to-End First Token Latency: P50, P90, P99 (in milliseconds)
+- Average Next Token Latency (in milliseconds)
+- Average Token Latency (in milliseconds)
+- Requests Per Second (RPS)
+- Output Tokens Per Second
+- Input Tokens Per Second
+
+Results are displayed in the terminal and saved as a CSV file named `1_testspec.yaml`.
+
+## Getting Started
+
+We recommend using Kubernetes to deploy the FaqGen service, as it offers benefits such as load balancing and improved scalability. However, you can also deploy the service using Docker if that better suits your needs.
+
+### Prerequisites
+
+- Install Kubernetes by following [this guide](https://github.com/opea-project/docs/blob/main/guide/installation/k8s_install/k8s_install_kubespray.md).
+
+- Ensure every node has direct internet access.
+- Set up kubectl on the master node with access to the Kubernetes cluster.
+- Install Python 3.8+ on the master node for running GenAIEval.
+- Ensure all nodes have a local /mnt/models folder, which will be mounted by the pods.
+- Ensure that the container's ulimit is large enough for the expected number of requests.
+
+```bash
+# To modify the ulimit of containers managed by containerd:
+sudo systemctl edit containerd
+# Add the following two lines:
+[Service]
+LimitNOFILE=65536:1048576
+
+sudo systemctl daemon-reload; sudo systemctl restart containerd
+```
+
+### Test Steps
+
+Please deploy the FaqGen service before benchmarking; a reference deployment sketch follows below.
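+
+If the service is not yet deployed, the sketch below shows one way to do it with the Kubernetes manifests shipped in this repository (see `FaqGen/kubernetes/intel/README.md`); it assumes your `kubectl` context already points at the target cluster and that you substitute your own HuggingFace token:
+
+```bash
+# Sketch: deploy FaqGen on Gaudi from the in-repo manifest
+cd GenAIExamples/FaqGen/kubernetes/intel/hpu/gaudi/manifest
+export HUGGINGFACEHUB_API_TOKEN="YourOwnToken"
+sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" faqgen.yaml
+kubectl apply -f faqgen.yaml
+# Wait until all pods are ready before starting the benchmark
+kubectl get pods -w
+```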
+
+#### Run Benchmark Test
+
+Before running the benchmark, configure the number of test queries and the test output directory:
+
+```bash
+export USER_QUERIES="[1, 1, 1, 1]"
+export TEST_OUTPUT_DIR="/tmp/benchmark_output"
+```
+
+Then run the benchmark:
+
+```bash
+bash benchmark.sh -n <node_number>
+```
+
+The argument `-n` specifies the number of test nodes.
+
+#### Data Collection
+
+All test results are written to the folder `/tmp/benchmark_output`, as configured by the environment variable `TEST_OUTPUT_DIR` in the previous steps.
diff --git a/FaqGen/benchmark/performance/benchmark.sh b/FaqGen/benchmark/performance/benchmark.sh
new file mode 100644
index 0000000000..44abdecbb1
--- /dev/null
+++ b/FaqGen/benchmark/performance/benchmark.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+deployment_type="k8s"
+node_number=1
+service_port=8888
+query_per_node=128
+
+benchmark_tool_path="$(pwd)/GenAIEval"
+
+usage() {
+  echo "Usage: $0 [-d deployment_type] [-n node_number] [-i service_ip] [-p service_port]"
+  echo "  -d deployment_type    deployment type, select between k8s and docker (default: ${deployment_type})"
+  echo "  -n node_number        Test node number, required only for k8s deployment_type, (default: ${node_number})"
+  echo "  -i service_ip         service ip, required only for docker deployment_type"
+  echo "  -p service_port       service port, required only for docker deployment_type, (default: ${service_port})"
+  exit 1
+}
+
+while getopts ":d:n:i:p:" opt; do
+  case ${opt} in
+    d )
+      deployment_type=$OPTARG
+      ;;
+    n )
+      node_number=$OPTARG
+      ;;
+    i )
+      service_ip=$OPTARG
+      ;;
+    p )
+      service_port=$OPTARG
+      ;;
+    \? )
+      echo "Invalid option: -$OPTARG" 1>&2
+      usage
+      ;;
+    : )
+      echo "Invalid option: -$OPTARG requires an argument" 1>&2
+      usage
+      ;;
+  esac
+done
+
+if [[ "$deployment_type" == "docker" && -z "$service_ip" ]]; then
+  echo "Error: service_ip is required for docker deployment_type" 1>&2
+  usage
+fi
+
+if [[ "$deployment_type" == "k8s" && ( -n "$service_ip" || -n "$service_port" ) ]]; then
+  echo "Warning: service_ip and service_port are ignored for k8s deployment_type" 1>&2
+fi
+
+function main() {
+  if [[ ! -d ${benchmark_tool_path} ]]; then
+    echo "Benchmark tool not found, setting up..."
+    setup_env
+  fi
+  run_benchmark
+}
+
+function setup_env() {
+  git clone https://github.com/opea-project/GenAIEval.git
+  pushd ${benchmark_tool_path}
+  python3 -m venv stress_venv
+  source stress_venv/bin/activate
+  pip install -r requirements.txt
+  popd
+}
+
+function run_benchmark() {
+  source ${benchmark_tool_path}/stress_venv/bin/activate
+  export DEPLOYMENT_TYPE=${deployment_type}
+  export SERVICE_IP=${service_ip:-"None"}
+  export SERVICE_PORT=${service_port:-"None"}
+  if [[ -z $USER_QUERIES ]]; then
+    user_query=$((query_per_node*node_number))
+    export USER_QUERIES="[${user_query}, ${user_query}, ${user_query}, ${user_query}]"
+    echo "USER_QUERIES not configured, setting to: ${USER_QUERIES}."
+  fi
+  export WARMUP=$(echo $USER_QUERIES | sed -e 's/[][]//g' -e 's/,.*//')
+  if [[ -z $WARMUP ]]; then export WARMUP=0; fi
+  if [[ -z $TEST_OUTPUT_DIR ]]; then
+    if [[ $DEPLOYMENT_TYPE == "k8s" ]]; then
+      export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/node_${node_number}"
+    else
+      export TEST_OUTPUT_DIR="${benchmark_tool_path}/evals/benchmark/benchmark_output/docker"
+    fi
+    echo "TEST_OUTPUT_DIR not configured, setting to: ${TEST_OUTPUT_DIR}."
+  fi
+
+  envsubst < ./benchmark.yaml > ${benchmark_tool_path}/evals/benchmark/benchmark.yaml
+  cd ${benchmark_tool_path}/evals/benchmark
+  python benchmark.py
+}
+
+main
diff --git a/FaqGen/benchmark/performance/benchmark.yaml b/FaqGen/benchmark/performance/benchmark.yaml
new file mode 100644
index 0000000000..2c9c914de3
--- /dev/null
+++ b/FaqGen/benchmark/performance/benchmark.yaml
@@ -0,0 +1,47 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+test_suite_config: # Overall configuration settings for the test suite
+  examples: ["faqgen"] # The specific test cases being tested, e.g., chatqna, codegen, codetrans, faqgen, audioqna, visualqna
+  deployment_type: "k8s" # Default is "k8s", can also be "docker"
+  service_ip: None # Leave as None for k8s, specify for Docker
+  service_port: None # Leave as None for k8s, specify for Docker
+  warm_ups: 0 # Number of test requests for warm-up
+  run_time: 60m # The max total run time for the test suite
+  seed: # The seed for all RNGs
+  user_queries: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048] # Number of test requests at each concurrency level
+  query_timeout: 120 # Number of seconds to wait for a simulated user to complete any executing task before exiting. 120 sec by default.
+  random_prompt: false # Use random prompts if true, fixed prompts if false
+  collect_service_metric: false # Collect service metrics if true, do not collect service metrics if false
+  data_visualization: false # Generate data visualization if true, do not generate data visualization if false
+  llm_model: "meta-llama/Meta-Llama-3-8B-Instruct" # The LLM model used for the test
+  test_output_dir: "/tmp/benchmark_output" # The directory to store the test output
+  load_shape: # Tenant concurrency pattern
+    name: constant # poisson or constant (locust default load shape)
+    params: # Load-shape-specific parameters
+      constant: # Constant load shape specific parameters, activate only if load_shape.name is constant
+        concurrent_level: 4 # If user_queries is specified, concurrent_level is the target number of requests per user. If not, it is the number of simulated users
+        # arrival_rate: 1.0 # Request arrival rate. If set, concurrent_level will be overridden and a constant load will be generated based on arrival_rate
+      poisson: # Poisson load shape specific parameters, activate only if load_shape.name is poisson
+        arrival_rate: 1.0 # Request arrival rate
+  namespace: "" # Fill in the user-defined namespace. Otherwise, the default namespace will be used.
+ +test_cases: + faqgen: + llm: + run_test: false + service_name: "faq-tgi-svc" # Replace with your service name + parameters: + model_name: "meta-llama/Meta-Llama-3-8B-Instruct" + max_new_tokens: 128 + temperature: 0.01 + top_k: 10 + top_p: 0.95 + repetition_penalty: 1.03 + streaming: true + llmserve: + run_test: false + service_name: "faq-micro-svc" # Replace with your service name + e2e: + run_test: true + service_name: "faq-mega-server-svc" # Replace with your service name diff --git a/FaqGen/docker_compose/intel/cpu/xeon/README.md b/FaqGen/docker_compose/intel/cpu/xeon/README.md index 04fea0f859..2ed343e2ef 100644 --- a/FaqGen/docker_compose/intel/cpu/xeon/README.md +++ b/FaqGen/docker_compose/intel/cpu/xeon/README.md @@ -79,6 +79,7 @@ export TGI_LLM_ENDPOINT="http://${your_ip}:8008" export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} export MEGA_SERVICE_HOST_IP=${host_ip} export LLM_SERVICE_HOST_IP=${host_ip} +export LLM_SERVICE_PORT=9000 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/faqgen" ``` @@ -114,9 +115,11 @@ docker compose up -d 3. MegaService ```bash - curl http://${host_ip}:8888/v1/faqgen -H "Content-Type: application/json" -d '{ - "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." - }' + curl http://${host_ip}:8888/v1/faqgen \ + -H "Content-Type: multipart/form-data" \ + -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \ + -F "max_tokens=32" \ + -F "stream=false" ``` Following the validation of all aforementioned microservices, we are now prepared to construct a mega-service. diff --git a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml index 8c5c894aed..18a6a7ec35 100644 --- a/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml +++ b/FaqGen/docker_compose/intel/cpu/xeon/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu + image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu container_name: tgi-xeon-server ports: - "8008:80" @@ -46,6 +46,7 @@ services: - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - LLM_SERVICE_PORT=${LLM_SERVICE_PORT} ipc: host restart: always faqgen-xeon-ui-server: @@ -59,7 +60,7 @@ services: - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} + - FAQ_BASE_URL=${BACKEND_SERVICE_ENDPOINT} ipc: host restart: always networks: diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/README.md b/FaqGen/docker_compose/intel/hpu/gaudi/README.md index acdded9c2c..81473e49c2 100644 --- a/FaqGen/docker_compose/intel/hpu/gaudi/README.md +++ b/FaqGen/docker_compose/intel/hpu/gaudi/README.md @@ -11,7 +11,7 @@ First of all, you need to build Docker Images locally. This step can be ignored As TGI Gaudi has been officially published as a Docker image, we simply need to pull it: ```bash -docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 +docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 ``` ### 2. 
Build LLM Image @@ -28,7 +28,7 @@ To construct the Mega Service, we utilize the [GenAIComps](https://github.com/op ```bash git clone https://github.com/opea-project/GenAIExamples -cd GenAIExamples/FaqGen/docker/ +cd GenAIExamples/FaqGen/ docker build --no-cache -t opea/faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile . ``` @@ -37,7 +37,7 @@ docker build --no-cache -t opea/faqgen:latest --build-arg https_proxy=$https_pro Construct the frontend Docker image using the command below: ```bash -cd GenAIExamples/FaqGen/ +cd GenAIExamples/FaqGen/ui docker build -t opea/faqgen-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile . ``` @@ -53,7 +53,7 @@ docker build -t opea/faqgen-react-ui:latest --build-arg https_proxy=$https_proxy Then run the command `docker images`, you will have the following Docker Images: -1. `ghcr.io/huggingface/tgi-gaudi:2.0.5` +1. `ghcr.io/huggingface/tgi-gaudi:2.0.6` 2. `opea/llm-faqgen-tgi:latest` 3. `opea/faqgen:latest` 4. `opea/faqgen-ui:latest` @@ -80,6 +80,7 @@ export TGI_LLM_ENDPOINT="http://${your_ip}:8008" export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} export MEGA_SERVICE_HOST_IP=${host_ip} export LLM_SERVICE_HOST_IP=${host_ip} +export LLM_SERVICE_PORT=9000 export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/faqgen" ``` @@ -115,9 +116,11 @@ docker compose up -d 3. MegaService ```bash - curl http://${host_ip}:8888/v1/faqgen -H "Content-Type: application/json" -d '{ - "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." - }' + curl http://${host_ip}:8888/v1/faqgen \ + -H "Content-Type: multipart/form-data" \ + -F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." 
\ + -F "max_tokens=32" \ + -F "stream=false" ``` ## 🚀 Launch the UI diff --git a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml index 1ee36bd302..f810319f0e 100644 --- a/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/FaqGen/docker_compose/intel/hpu/gaudi/compose.yaml @@ -3,7 +3,7 @@ services: tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 container_name: tgi-gaudi-server ports: - "8008:80" @@ -56,6 +56,7 @@ services: - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - LLM_SERVICE_PORT=${LLM_SERVICE_PORT} ipc: host restart: always faqgen-gaudi-ui-server: @@ -69,7 +70,7 @@ services: - no_proxy=${no_proxy} - https_proxy=${https_proxy} - http_proxy=${http_proxy} - - DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT} + - FAQ_BASE_URL=${BACKEND_SERVICE_ENDPOINT} ipc: host restart: always diff --git a/FaqGen/faqgen.yaml b/FaqGen/faqgen.yaml index 8d354871e0..5b924a38eb 100644 --- a/FaqGen/faqgen.yaml +++ b/FaqGen/faqgen.yaml @@ -6,7 +6,7 @@ opea_micro_services: tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 volumes: - "./data:/data" runtime: habana diff --git a/FaqGen/kubernetes/intel/README.md b/FaqGen/kubernetes/intel/README.md index 461941b33a..42c099ef45 100644 --- a/FaqGen/kubernetes/intel/README.md +++ b/FaqGen/kubernetes/intel/README.md @@ -17,7 +17,7 @@ If use gated models, you also need to provide [huggingface token](https://huggin ## Deploy On Xeon ``` -cd GenAIExamples/FaqGen/kubernetes/intel/cpu/xeon/manifests +cd GenAIExamples/FaqGen/kubernetes/intel/cpu/xeon/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" faqgen.yaml kubectl apply -f faqgen.yaml @@ -26,7 +26,7 @@ kubectl apply -f faqgen.yaml ## Deploy On Gaudi ``` -cd GenAIExamples/FaqGen/kubernetes/intel/hpu/gaudi/manifests +cd GenAIExamples/FaqGen/kubernetes/intel/hpu/gaudi/manifest export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" faqgen.yaml kubectl apply -f faqgen.yaml diff --git a/FaqGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md b/FaqGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md index ff768c4acd..2e0ffcdb40 100644 --- a/FaqGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md +++ b/FaqGen/kubernetes/intel/cpu/xeon/manifest/README_react_ui.md @@ -16,7 +16,7 @@ Before deploying the react-faqgen.yaml file, ensure that you have the following ``` # You may set the HUGGINGFACEHUB_API_TOKEN via method: export HUGGINGFACEHUB_API_TOKEN="YourOwnToken" - cd GenAIExamples/FaqGen/kubernetes/intel/cpu/xeon/manifests/ui/ + cd GenAIExamples/FaqGen/kubernetes/intel/cpu/xeon/manifest/ui/ sed -i "s/insert-your-huggingface-token-here/${HUGGINGFACEHUB_API_TOKEN}/g" react-faqgen.yaml ``` b. 
Set the proxies based on your network configuration diff --git a/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml b/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml index 845ba50412..4577372495 100644 --- a/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml +++ b/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_react_ui.yaml @@ -126,7 +126,7 @@ spec: - name: no_proxy value: securityContext: {} - image: "ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu" + image: "ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu" imagePullPolicy: IfNotPresent volumeMounts: - mountPath: /data @@ -264,7 +264,7 @@ spec: containers: - name: faqgen-react-ui env: - - name: DOC_BASE_URL + - name: FAQ_BASE_URL value: "http://faqgen:8888/v1/faqgen" - name: http_proxy value: diff --git a/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_ui.yaml b/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_ui.yaml index f74299a094..6b531a0c78 100644 --- a/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_ui.yaml +++ b/FaqGen/kubernetes/intel/cpu/xeon/manifest/faqgen_ui.yaml @@ -22,7 +22,7 @@ spec: containers: - name: faq-mega-ui-deploy env: - - name: DOC_BASE_URL + - name: FAQ_BASE_URL value: http://{insert_your_ip_here}:7779/v1/faqgen image: opea/faqgen-ui:latest imagePullPolicy: IfNotPresent diff --git a/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml b/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml index 2703cbc4ef..a9b8ef199e 100644 --- a/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml +++ b/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml @@ -47,7 +47,7 @@ spec: value: 'true' - name: FLASH_ATTENTION_RECOMPUTE value: 'true' - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + image: ghcr.io/huggingface/tgi-gaudi:2.0.6 imagePullPolicy: IfNotPresent securityContext: capabilities: diff --git a/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen_ui.yaml b/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen_ui.yaml index f74299a094..6b531a0c78 100644 --- a/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen_ui.yaml +++ b/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen_ui.yaml @@ -22,7 +22,7 @@ spec: containers: - name: faq-mega-ui-deploy env: - - name: DOC_BASE_URL + - name: FAQ_BASE_URL value: http://{insert_your_ip_here}:7779/v1/faqgen image: opea/faqgen-ui:latest imagePullPolicy: IfNotPresent diff --git a/FaqGen/tests/test_compose_on_gaudi.sh b/FaqGen/tests/test_compose_on_gaudi.sh index a583397801..dc12dfde8a 100644 --- a/FaqGen/tests/test_compose_on_gaudi.sh +++ b/FaqGen/tests/test_compose_on_gaudi.sh @@ -22,7 +22,7 @@ function build_docker_images() { service_list="faqgen faqgen-ui llm-faqgen-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 + docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6 docker images && sleep 1s } @@ -34,6 +34,7 @@ function start_services() { export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} export LLM_SERVICE_HOST_IP=${ip_address} + export LLM_SERVICE_PORT=9000 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/faqgen" sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env @@ -101,13 +102,30 @@ function validate_microservices() { } function validate_megaservice() { - # Curl the Mega Service - validate_services \ - "${ip_address}:8888/v1/faqgen" \ - "Text Embeddings Inference" \ - "mega-faqgen" \ - "faqgen-gaudi-backend-server" \ - 
'{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + local SERVICE_NAME="mega-faqgen" + local DOCKER_NAME="faqgen-gaudi-backend-server" + local EXPECTED_RESULT="Embeddings" + local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." + local URL="${ip_address}:8888/v1/faqgen" + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + + local CONTENT=$(curl -s -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s } function validate_frontend() { @@ -152,7 +170,7 @@ function main() { validate_microservices validate_megaservice - validate_frontend + # validate_frontend stop_docker echo y | docker system prune diff --git a/FaqGen/tests/test_compose_on_xeon.sh b/FaqGen/tests/test_compose_on_xeon.sh index c6265e02d8..3dbde68283 100755 --- a/FaqGen/tests/test_compose_on_xeon.sh +++ b/FaqGen/tests/test_compose_on_xeon.sh @@ -34,6 +34,7 @@ function start_services() { export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} export LLM_SERVICE_HOST_IP=${ip_address} + export LLM_SERVICE_PORT=9000 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/faqgen" sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env @@ -101,13 +102,30 @@ function validate_microservices() { } function validate_megaservice() { - # Curl the Mega Service - validate_services \ - "${ip_address}:8888/v1/faqgen" \ - "Text Embeddings Inference" \ - "mega-faqgen" \ - "faqgen-xeon-backend-server" \ - '{"messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' + local SERVICE_NAME="mega-faqgen" + local DOCKER_NAME="faqgen-xeon-backend-server" + local EXPECTED_RESULT="Embeddings" + local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." + local URL="${ip_address}:8888/v1/faqgen" + local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." 
+ + local CONTENT=$(curl -s -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) + + if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then + echo "[ $SERVICE_NAME ] Content is as expected." + else + echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + else + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log + exit 1 + fi + sleep 1s } function validate_frontend() { @@ -152,7 +170,7 @@ function main() { validate_microservices validate_megaservice - validate_frontend + # validate_frontend stop_docker echo y | docker system prune diff --git a/FaqGen/ui/docker/Dockerfile b/FaqGen/ui/docker/Dockerfile index ac2bb7da31..1d5115f4b5 100644 --- a/FaqGen/ui/docker/Dockerfile +++ b/FaqGen/ui/docker/Dockerfile @@ -23,4 +23,4 @@ RUN npm run build EXPOSE 5173 # Run the front-end application in preview mode -CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] \ No newline at end of file +CMD ["npm", "run", "preview", "--", "--port", "5173", "--host", "0.0.0.0"] diff --git a/FaqGen/ui/docker/Dockerfile.react b/FaqGen/ui/docker/Dockerfile.react index 4e29136a6d..734c6ba1dd 100644 --- a/FaqGen/ui/docker/Dockerfile.react +++ b/FaqGen/ui/docker/Dockerfile.react @@ -17,4 +17,4 @@ EXPOSE 80 COPY --from=vite-app /usr/app/react/nginx.conf /etc/nginx/conf.d/default.conf COPY --from=vite-app /usr/app/react/dist /usr/share/nginx/html -ENTRYPOINT ["nginx", "-g", "daemon off;"] \ No newline at end of file +ENTRYPOINT ["nginx", "-g", "daemon off;"] diff --git a/FaqGen/ui/react/nginx.conf b/FaqGen/ui/react/nginx.conf index 00433fcda7..01aef12751 100644 --- a/FaqGen/ui/react/nginx.conf +++ b/FaqGen/ui/react/nginx.conf @@ -17,4 +17,4 @@ server { expires 1d; } } -} \ No newline at end of file +} diff --git a/FaqGen/ui/react/public/vite.svg b/FaqGen/ui/react/public/vite.svg index e7b8dfb1b2..ee9fadaf9c 100644 --- a/FaqGen/ui/react/public/vite.svg +++ b/FaqGen/ui/react/public/vite.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/FaqGen/ui/react/src/assets/react.svg b/FaqGen/ui/react/src/assets/react.svg index 6c87de9bb3..8e0e0f15c0 100644 --- a/FaqGen/ui/react/src/assets/react.svg +++ b/FaqGen/ui/react/src/assets/react.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/FaqGen/ui/react/src/components/FaqGen/FaqGen.tsx b/FaqGen/ui/react/src/components/FaqGen/FaqGen.tsx index ca731cbf8b..74f52d3ef3 100644 --- a/FaqGen/ui/react/src/components/FaqGen/FaqGen.tsx +++ b/FaqGen/ui/react/src/components/FaqGen/FaqGen.tsx @@ -164,4 +164,4 @@ const FaqGen = () => { ) } -export default FaqGen; \ No newline at end of file +export default FaqGen; diff --git a/FaqGen/ui/react/src/components/FaqGen/FileUpload.tsx b/FaqGen/ui/react/src/components/FaqGen/FileUpload.tsx index 914ac87241..aa5d84a00f 100644 --- a/FaqGen/ui/react/src/components/FaqGen/FileUpload.tsx +++ b/FaqGen/ui/react/src/components/FaqGen/FileUpload.tsx @@ -76,4 +76,4 @@ export function FileUpload(props: Partial) { ); -} \ No newline at end of file +} diff --git a/FaqGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx b/FaqGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx index 479034cece..a21f7acc59 100644 --- a/FaqGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx +++ b/FaqGen/ui/react/src/components/Shared/CodeRender/CodeRender.tsx @@ -49,4 +49,4 @@ 
const CodeRender = ({ cleanCode, language, inline }:CodeRenderProps) => { } -export default CodeRender; \ No newline at end of file +export default CodeRender; diff --git a/FaqGen/ui/react/src/components/Shared/Markdown/Markdown.tsx b/FaqGen/ui/react/src/components/Shared/Markdown/Markdown.tsx index 64f624bc6e..653ac4b025 100644 --- a/FaqGen/ui/react/src/components/Shared/Markdown/Markdown.tsx +++ b/FaqGen/ui/react/src/components/Shared/Markdown/Markdown.tsx @@ -58,4 +58,4 @@ const Markdown = ({ content }: MarkdownProps) => { />) } -export default Markdown; \ No newline at end of file +export default Markdown; diff --git a/FaqGen/ui/svelte/.env b/FaqGen/ui/svelte/.env index bfdca1c9a1..4d0880c767 100644 --- a/FaqGen/ui/svelte/.env +++ b/FaqGen/ui/svelte/.env @@ -1 +1 @@ -DOC_BASE_URL = 'http://backend_address:8888/v1/faqgen' +FAQ_BASE_URL = 'http://backend_address:8888/v1/faqgen' diff --git a/FaqGen/ui/svelte/src/lib/doc.svelte b/FaqGen/ui/svelte/src/lib/doc.svelte index bae896ba3c..f9ea335843 100644 --- a/FaqGen/ui/svelte/src/lib/doc.svelte +++ b/FaqGen/ui/svelte/src/lib/doc.svelte @@ -38,8 +38,8 @@ } else { currentIdx = index; if ( - (currentIdx === 1 && message !== "") || - (currentIdx === 2 && $kb_id !== "") + (currentIdx === 2 && message !== "") || + (currentIdx === 1 && $kb_id !== "") ) { formModal = true; } else { @@ -49,10 +49,10 @@ } function panelExchange() { - if (currentIdx === 2) { + if (currentIdx === 1) { kb_id.set(""); dispatch("clearMsg", { status: true }); - } else if (currentIdx === 1) { + } else if (currentIdx === 2) { message = ""; dispatch("clearMsg", { status: true }); } @@ -152,7 +152,7 @@ type="submit" data-testid="sum-click" class="xl:my-12 inline-flex items-center px-5 py-2.5 text-sm font-medium text-center text-white bg-blue-700 mt-2 focus:ring-4 focus:ring-blue-200 dark:focus:ring-blue-900 hover:bg-blue-800" - on:click={() => generateFaq()} + on:click={() => generateFaq()} > Generate FAQs @@ -165,11 +165,11 @@ /> {#if currentIdx === 1}

- The current content will be cleared.
+ The currently uploaded file will be cleared.

{:else if currentIdx === 2}

- The currently uploaded file will be cleared.
+ The current content will be cleared.
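<!-- Index mapping assumed from panelExchange above: panel 1 holds the uploaded
     file and panel 2 the pasted content, so each branch names what it clears. -->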

{/if} diff --git a/FaqGen/ui/svelte/src/lib/dropFile.svelte b/FaqGen/ui/svelte/src/lib/dropFile.svelte index ef52ca1d0c..fcc972c543 100644 --- a/FaqGen/ui/svelte/src/lib/dropFile.svelte +++ b/FaqGen/ui/svelte/src/lib/dropFile.svelte @@ -15,30 +15,36 @@ --> + + diff --git a/GraphRAG/ui/react/nginx.conf b/GraphRAG/ui/react/nginx.conf new file mode 100644 index 0000000000..01aef12751 --- /dev/null +++ b/GraphRAG/ui/react/nginx.conf @@ -0,0 +1,20 @@ +server { + listen 80; + + gzip on; + gzip_proxied any; + gzip_comp_level 6; + gzip_buffers 16 8k; + gzip_http_version 1.1; + gzip_types font/woff2 text/css application/javascript application/json application/font-woff application/font-tff image/gif image/png image/svg+xml application/octet-stream; + + location / { + root /usr/share/nginx/html; + index index.html index.htm; + try_files $uri $uri/ /index.html =404; + + location ~* \.(gif|jpe?g|png|webp|ico|svg|css|js|mp4|woff2)$ { + expires 1d; + } + } +} diff --git a/GraphRAG/ui/react/package.json b/GraphRAG/ui/react/package.json new file mode 100644 index 0000000000..3760ed909b --- /dev/null +++ b/GraphRAG/ui/react/package.json @@ -0,0 +1,47 @@ +{ + "name": "ui", + "private": true, + "version": "0.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0", + "preview": "vite preview", + "test": "vitest" + }, + "dependencies": { + "@mantine/core": "^7.10.0", + "@mantine/hooks": "^7.10.0", + "@mantine/notifications": "^7.10.2", + "@microsoft/fetch-event-source": "^2.0.1", + "@reduxjs/toolkit": "^2.2.5", + "@tabler/icons-react": "^3.5.0", + "axios": "^1.7.2", + "luxon": "^3.4.4", + "react": "^18.2.0", + "react-dom": "^18.2.0", + "react-redux": "^9.1.2" + }, + "devDependencies": { + "@testing-library/react": "^16.0.0", + "@types/luxon": "^3.4.2", + "@types/node": "^20.12.12", + "@types/react": "^18.2.66", + "@types/react-dom": "^18.2.22", + "@typescript-eslint/eslint-plugin": "^7.2.0", + "@typescript-eslint/parser": "^7.2.0", + "@vitejs/plugin-react": "^4.2.1", + "eslint": "^8.57.0", + "eslint-plugin-react-hooks": "^4.6.0", + "eslint-plugin-react-refresh": "^0.4.6", + "jsdom": "^24.1.0", + "postcss": "^8.4.38", + "postcss-preset-mantine": "^1.15.0", + "postcss-simple-vars": "^7.0.1", + "sass": "1.64.2", + "typescript": "^5.2.2", + "vite": "^5.2.13", + "vitest": "^1.6.0" + } +} diff --git a/GraphRAG/ui/react/postcss.config.cjs b/GraphRAG/ui/react/postcss.config.cjs new file mode 100644 index 0000000000..e817f567be --- /dev/null +++ b/GraphRAG/ui/react/postcss.config.cjs @@ -0,0 +1,14 @@ +module.exports = { + plugins: { + "postcss-preset-mantine": {}, + "postcss-simple-vars": { + variables: { + "mantine-breakpoint-xs": "36em", + "mantine-breakpoint-sm": "48em", + "mantine-breakpoint-md": "62em", + "mantine-breakpoint-lg": "75em", + "mantine-breakpoint-xl": "88em", + }, + }, + }, +}; diff --git a/GraphRAG/ui/react/public/vite.svg b/GraphRAG/ui/react/public/vite.svg new file mode 100644 index 0000000000..ee9fadaf9c --- /dev/null +++ b/GraphRAG/ui/react/public/vite.svg @@ -0,0 +1 @@ + diff --git a/GraphRAG/ui/react/src/App.scss b/GraphRAG/ui/react/src/App.scss new file mode 100644 index 0000000000..187764a179 --- /dev/null +++ b/GraphRAG/ui/react/src/App.scss @@ -0,0 +1,42 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +@import "./styles/styles"; + +.root { + @include flex(row, nowrap, flex-start, flex-start); +} + +.layout-wrapper { + @include 
absolutes; + + display: grid; + + width: 100%; + height: 100%; + + grid-template-columns: 80px auto; + grid-template-rows: 1fr; +} + +/* ===== Scrollbar CSS ===== */ +/* Firefox */ +* { + scrollbar-width: thin; + scrollbar-color: #d6d6d6 #ffffff; +} + +/* Chrome, Edge, and Safari */ +*::-webkit-scrollbar { + width: 8px; +} + +*::-webkit-scrollbar-track { + background: #ffffff; +} + +*::-webkit-scrollbar-thumb { + background-color: #d6d6d6; + border-radius: 16px; + border: 4px double #dedede; +} diff --git a/GraphRAG/ui/react/src/App.tsx b/GraphRAG/ui/react/src/App.tsx new file mode 100644 index 0000000000..4be4fa5bb5 --- /dev/null +++ b/GraphRAG/ui/react/src/App.tsx @@ -0,0 +1,34 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +import "./App.scss" +import { MantineProvider } from "@mantine/core" +import '@mantine/notifications/styles.css'; +import { SideNavbar, SidebarNavList } from "./components/sidebar/sidebar" +import { IconMessages } from "@tabler/icons-react" +import UserInfoModal from "./components/UserInfoModal/UserInfoModal" +import Conversation from "./components/Conversation/Conversation" +import { Notifications } from '@mantine/notifications'; + +const title = "Chat QnA" +const navList: SidebarNavList = [ + { icon: IconMessages, label: title } +] + +function App() { + + return ( + + + +
+ +
+ +
+
+
+ ) +} + +export default App diff --git a/GraphRAG/ui/react/src/__tests__/util.test.ts b/GraphRAG/ui/react/src/__tests__/util.test.ts new file mode 100644 index 0000000000..e67ba2c86a --- /dev/null +++ b/GraphRAG/ui/react/src/__tests__/util.test.ts @@ -0,0 +1,14 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, test } from "vitest"; +import { getCurrentTimeStamp, uuidv4 } from "../common/util"; + +describe("unit tests", () => { + test("check UUID is of length 36", () => { + expect(uuidv4()).toHaveLength(36); + }); + test("check TimeStamp generated is of unix", () => { + expect(getCurrentTimeStamp()).toBe(Math.floor(Date.now() / 1000)); + }); +}); diff --git a/GraphRAG/ui/react/src/assets/opea-icon-black.svg b/GraphRAG/ui/react/src/assets/opea-icon-black.svg new file mode 100644 index 0000000000..5c96dc7622 --- /dev/null +++ b/GraphRAG/ui/react/src/assets/opea-icon-black.svg @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/GraphRAG/ui/react/src/assets/opea-icon-color.svg b/GraphRAG/ui/react/src/assets/opea-icon-color.svg new file mode 100644 index 0000000000..790151171e --- /dev/null +++ b/GraphRAG/ui/react/src/assets/opea-icon-color.svg @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/GraphRAG/ui/react/src/assets/react.svg b/GraphRAG/ui/react/src/assets/react.svg new file mode 100644 index 0000000000..8e0e0f15c0 --- /dev/null +++ b/GraphRAG/ui/react/src/assets/react.svg @@ -0,0 +1 @@ + diff --git a/GraphRAG/ui/react/src/common/client.ts b/GraphRAG/ui/react/src/common/client.ts new file mode 100644 index 0000000000..7512f73e33 --- /dev/null +++ b/GraphRAG/ui/react/src/common/client.ts @@ -0,0 +1,8 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +import axios from "axios"; + +//add iterceptors to add any request headers + +export default axios; diff --git a/GraphRAG/ui/react/src/common/util.ts b/GraphRAG/ui/react/src/common/util.ts new file mode 100644 index 0000000000..df65b2d8e0 --- /dev/null +++ b/GraphRAG/ui/react/src/common/util.ts @@ -0,0 +1,12 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +export const getCurrentTimeStamp = () => { + return Math.floor(Date.now() / 1000); +}; + +export const uuidv4 = () => { + return "10000000-1000-4000-8000-100000000000".replace(/[018]/g, (c) => + (+c ^ (crypto.getRandomValues(new Uint8Array(1))[0] & (15 >> (+c / 4)))).toString(16), + ); +}; diff --git a/GraphRAG/ui/react/src/components/Conversation/Conversation.tsx b/GraphRAG/ui/react/src/components/Conversation/Conversation.tsx new file mode 100644 index 0000000000..02736d8bd6 --- /dev/null +++ b/GraphRAG/ui/react/src/components/Conversation/Conversation.tsx @@ -0,0 +1,156 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +import { KeyboardEventHandler, SyntheticEvent, useEffect, useRef, useState } from 'react' +import styleClasses from "./conversation.module.scss" +import { ActionIcon, Group, Textarea, Title, rem } from '@mantine/core' +import { IconArrowRight, IconFilePlus, IconMessagePlus } from '@tabler/icons-react' +import { conversationSelector, doConversation, newConversation } from '../../redux/Conversation/ConversationSlice' +import { ConversationMessage } from '../Message/conversationMessage' +import { useAppDispatch, useAppSelector } from '../../redux/store' +import { Message, MessageRole } from '../../redux/Conversation/Conversation' +import { 
getCurrentTimeStamp } from '../../common/util' +import { useDisclosure } from '@mantine/hooks' +import DataSource from './DataSource' +import { ConversationSideBar } from './ConversationSideBar' + +type ConversationProps = { + title:string +} + +const Conversation = ({ title }: ConversationProps) => { + + const [prompt, setPrompt] = useState("") + const promptInputRef = useRef(null) + const [fileUploadOpened, { open: openFileUpload, close: closeFileUpload }] = useDisclosure(false); + + const { conversations, onGoingResult, selectedConversationId } = useAppSelector(conversationSelector) + const dispatch = useAppDispatch(); + const selectedConversation = conversations.find(x=>x.conversationId===selectedConversationId) + + const scrollViewport = useRef(null) + + const toSend = "Enter" + + const systemPrompt: Partial = { + role: MessageRole.System, + content: "You are helpful assistant", + }; + + + const handleSubmit = () => { + + const userPrompt: Message = { + role: MessageRole.User, + content: prompt, + time: getCurrentTimeStamp() + }; + let messages: Partial[] = []; + if(selectedConversation){ + messages = selectedConversation.Messages.map(message => { + return {role:message.role, content:message.content} + }) + } + + messages = [systemPrompt, ...messages] + + doConversation({ + conversationId: selectedConversationId, + userPrompt, + messages, + model: "Intel/neural-chat-7b-v3-3", + }) + setPrompt("") + } + + const scrollToBottom = () => { + scrollViewport.current!.scrollTo({ top: scrollViewport.current!.scrollHeight }) + } + + useEffect(() => { + scrollToBottom() + }, [onGoingResult, selectedConversation?.Messages]) + + const handleKeyDown: KeyboardEventHandler = (event) => { + if (!event.shiftKey && event.key === toSend) { + handleSubmit() + setTimeout(() => { + setPrompt("") + }, 1) + } + } + + + + const handleNewConversation = () => { + dispatch(newConversation()) + } + + const handleChange = (event: SyntheticEvent) => { + event.preventDefault() + setPrompt((event.target as HTMLTextAreaElement).value) + } + return ( +
+ +
+
+
+ {selectedConversation?.title || ""} + + + {selectedConversation && selectedConversation?.Messages.length > 0 && ( + + + + )} + + + + +
+ +
+ + {!selectedConversation && ( + <> +
Start by asking a question
+
You can also upload your document by clicking on the document icon in the top right corner
+ + )} + + {selectedConversation?.Messages.map((message) => { + return () + }) + } + + {onGoingResult && ( + + )} +
+ +
+