#!/usr/bin/env bash
# qwen25vl-7b-gantry.sh
#
# Launch a Beaker (gantry) job that fine-tunes Qwen2.5-VL-7B for PDF OCR:
# first materializes the dataset (olmocr.train.loaddataset), then trains
# with accelerate + FSDP (olmocr.train.train).
#
# Requires: gantry, beaker, jq on PATH; Beaker secrets S2_AWS_ACCESS_KEY_ID,
# S2_AWS_SECRET_ACCESS_KEY, JAKE_WANDB_API_KEY in the workspace.
set -euxo pipefail

# jq is needed below to parse the JSON from `beaker account whoami`.
if ! command -v jq &> /dev/null; then
  echo "jq could not be found. Please install it." >&2
  exit 1
fi

# Arguments shared by the dataset-loading and the training invocations.
# \${BEAKER_USER_ID} is escaped so it expands on the remote host (where the
# env var is injected via --env), not locally.
EXTRA_ARGS="-c olmocr/train/config/qwen25vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen25vl-pdf/v1/models/\${BEAKER_USER_ID}\""

# Name the run after this script file (without the .sh extension).
run_name=$(basename "$0" .sh)

# Alternative cluster/priority choices, kept for reference:
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \
CLUSTER='jupiter'

gantry run \
  --description "${run_name}" \
  --task-name "${run_name}" \
  --allow-dirty \
  --host-networking \
  --workspace ai2/oe-data-model-based-cleanup \
  --beaker-image 'jakep/jakep-pdf-finetunev1.2' \
  --venv 'base' \
  --pip gantry-requirements.txt \
  --priority high \
  --gpus 8 \
  --preemptible \
  --cluster "ai2/${CLUSTER}*" \
  --budget ai2/oe-data \
  --weka "oe-data-default:/data" \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env BEAKER_USER_ID="$(beaker account whoami --format json | jq '.[0].name' -cr)" \
  --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
  --shared-memory 10GiB \
  --yes \
  -- /bin/bash -c "pip install transformers==4.51.3 && source scripts/beaker/${CLUSTER}-ib.sh && python -m olmocr.train.loaddataset ${EXTRA_ARGS} && accelerate launch --use_fsdp --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"