Granite-3.1-Earthen-v0.3-3B-A800M

ibm-granite/granite-3.1-3b-a800m-instruct was trained at 8K context with a batch size of 2 and gradient accumulation of 8, so each step was 131,072 tokens (including any padding tokens). It was trained for 400 steps, for a total of 52,428,800 unique tokens seen.

This is a small test run. A larger version is planned.

Quants

GGUF

Prompt Format

This model uses the Granite-3.1 Instruct format:

<|start_of_role|>system<|end_of_role|>example system prompt<|end_of_text|>
<|start_of_role|>user<|end_of_role|>example user turn 1<|end_of_text|>
<|start_of_role|>assistant<|end_of_role|>example assistant turn 1<|end_of_text|>
<|start_of_role|>user<|end_of_role|>example user turn 2<|end_of_text|>
<|start_of_role|>assistant<|end_of_role|>example assistant turn 2<|end_of_text|>

Training Details

👁 Built with Axolotl

# Requirements before running
# - Get latest commit of axolotl (currently c0a0c75)
# - Download these to axolotl/src/axolotl/prompt_strategies
#   - https://github.com/xzuyn/axolotl/blob/came-plus-formatters/src/axolotl/prompt_strategies/formatter_regex.py
#   - https://github.com/xzuyn/axolotl/blob/came-plus-formatters/src/axolotl/prompt_strategies/customcompletion-regex.py
#   - https://github.com/xzuyn/axolotl/blob/came-plus-formatters/src/axolotl/prompt_strategies/customgranite-regex.py
# - pip install ftfy
# - pip install git+https://github.com/xzuyn/CAME.git@sr-grams-cautious-8bit

# Experiment tracking (Weights & Biases): this run's name and the project it is filed under
wandb_name: Granite-3.1-Earthen-v0.3-3B-A800M-QLoRA-run4
wandb_project: Granite-3.1-3B-A800M

# Model checkpointing config
# Checkpoints are written under output_dir; the directory name matches wandb_name.
output_dir: ./Outputs/Granite-3.1-Earthen-v0.3-3B-A800M-QLoRA-run4
# Nothing to resume from. Written as an explicit null rather than a bare
# empty value so the intent is unambiguous (the parsed value is the same).
resume_from_checkpoint: null
save_steps: 10
save_total_limit: 2
save_safetensors: true
save_only_model: false

# Base checkpoint being fine-tuned, plus the Auto* classes used for the model and tokenizer
base_model: ibm-granite/granite-3.1-3b-a800m-instruct
tokenizer_type: AutoTokenizer
model_type: AutoModelForCausalLM

# Mixed precision: bf16 is the only precision mode switched on
bf16: true
tf32: false
fp16: false

# Model loading: 4-bit load is enabled (pairs with `adapter: qlora`), 8-bit is disabled
load_in_4bit: true
load_in_8bit: false
strict: false

# Sequence config
# 8192-token training context (the "8K" quoted in the model card)
sequence_len: 8192
min_sample_len: 256

# Packing and padding
pad_to_sequence_len: true
sample_packing: true
eval_sample_packing: true
group_by_length: false

# Loss masking
train_on_inputs: false

# QLoRA adapter settings
adapter: qlora
lora_target_linear: true
# Rank and alpha are set to the same value (128 / 128)
lora_alpha: 128
lora_r: 128
lora_dropout: 0.125
embeddings_skip_upcast: true

# Dataset config
# Each list entry names one dataset (`path`) and the prompt strategy applied
# to it (`type`): `customcompletion-regex` for the completion-style data and
# `customgranite-regex` for the Granite-3.1 Instruct-formatted data.
# Entries carrying `split: train[:4000]` are limited to the first 4000 rows of
# the train split; entries without `split` are not capped in this config.
# The keys of every entry are nested under its `-` so each item parses as a
# single mapping.
datasets:
  # Completion / Story-like Data
  - path: BeaverAI/REDACTED1
    split: train[:4000]
    type: customcompletion-regex
  - path: PJMixers-Dev/Lit-axo-Shuffled
    split: train[:4000]
    type: customcompletion-regex
  - path: PJMixers-Dev/Mielikki_Erebus-87k-axo
    split: train[:4000]
    type: customcompletion-regex
  - path: PJMixers/RyokoAI_Honeyfeed3600-Cleanish
    split: train[:4000]
    type: customcompletion-regex
  - path: BeaverAI/REDACTED2
    type: customcompletion-regex
  - path: PJMixers-Dev/allura-org_fujin-cleaned-stage-2-axo
    split: train[:4000]
    type: customcompletion-regex
  - path: Nelathan/synthetic-sugar-quill
    split: train[:4000]
    type: customcompletion-regex
  - path: PJMixers-Dev/winglian_visual-novels-json-axo-dropped-long
    split: train[:4000]
    type: customcompletion-regex
  - path: BeaverAI/REDACTED3
    type: customcompletion-regex
  - path: PJMixers-Dev/recursal_SCP-RECURSAL-Cleaned
    split: train[:4000]
    type: customcompletion-regex
  # Completion / Subtitle Data
  - path: PJMixers-Dev/Subtitles
    type: customcompletion-regex
  - path: PJMixers-Dev/KaraKaraWitch_AnimeSubtitle-axo
    split: train[:4000]
    type: customcompletion-regex
  # Completion / News Data
  - path: PJMixers/AP-News-2024
    type: customcompletion-regex
  - path: PJMixers-Dev/Fundus-AP-News-Formatted
    split: train[:4000]
    type: customcompletion-regex
  - path: PJMixers-Dev/Fundus-AP-News-2-Formatted
    type: customcompletion-regex
  # Completion / Misc Data
  - path: PJMixers-Dev/goodwiki-2024-12-04-axo
    split: train[:4000]
    type: customcompletion-regex
  # NOTE(review): `field` presumably selects the text column of this dataset - confirm
  - path: epfl-llm/guidelines
    split: train[:4000]
    field: clean_text
    type: customcompletion-regex
  # Granite-3.1 Instruct / Instruction Data
  - path: PJMixers-Dev/allenai_tulu-3-sft-mixture-filtered-2-ShareGPT
    split: train[:4000]
    type: customgranite-regex
  - path: OpenLeecher/lmsys_chat_1m_clean
    split: train[:4000]
    type: customgranite-regex
  # Granite-3.1 Instruct / RP Data
  - path: PJMixers-Dev/Gryphe-Aesir-RPG-Charcards-Opus-Mixed
    type: customgranite-regex
  - path: allura-org/gryphe-sonnet-3.5-charcards-names-added
    type: customgranite-regex
  - path: anthracite-org/c2_logs_32k_llama3_qwen2_v1.3
    type: customgranite-regex
  - path: BeaverAI/REDACTED4
    type: customgranite-regex
  - path: PJMixers-Dev/MinervaAI_Aesir-Preview-Anon
    type: customgranite-regex
  - path: PJMixers-Dev/lemonilia_LimaRP-Simple-CustomShareGPT-Shuffled
    type: customgranite-regex
  - path: Epiculous/SynthRP-Gens-v1.1-Filtered-n-Cleaned
    type: customgranite-regex
  - path: PJMixers-Dev/NyxKrage_chub-logs-sharegpt-longest-CustomShareGPT
    type: customgranite-regex
  - path: PJMixers/OpenLeecher_Teatime_all_logs_longest-ShareGPT
    type: customgranite-regex
  - path: grimulkan/aicg-logs-augmented
    type: customgranite-regex
  - path: grimulkan/PIPPA-augmented-dedup
    type: customgranite-regex
  - path: PJMixers/grimulkan_bluemoon_Karen_cleaned-carded-formatted
    type: customgranite-regex
  # Granite-3.1 Instruct / InstStory Data
  - path: PJMixers/lodrick-the-lafted_OpusStories-ShareGPT
    type: customgranite-regex
  - path: Gryphe/ChatGPT-4o-Writing-Prompts
    type: customgranite-regex
  - path: Gryphe/Opus-WritingPrompts
    type: customgranite-regex
  - path: anthracite-org/nopm_claude_writing_fixed
    type: customgranite-regex
  - path: PJMixers-Dev/Tiefighter-13B-Fake-Distill-ShareGPT
    type: customgranite-regex
  - path: allura-org/fujin-instruct-v2
    type: customgranite-regex
  - path: ToastyPigeon/gutenberg-sft
    type: customgranite-regex
  # Granite-3.1 Instruct / Adventure Data
  - path: PocketDoc/Dans-Prosemaxx-Adventure
    type: customgranite-regex
  - path: PocketDoc/Dans-Failuremaxx-Adventure-3
    type: customgranite-regex
  # Granite-3.1 Instruct / Decensoring Data
  - path: TheDrummer/AmoralQA-v2
    type: customgranite-regex
  - path: BeaverAI/REDACTED5
    type: customgranite-regex
  - path: BeaverAI/REDACTED6
    type: customgranite-regex
# Evaluation: step-based, on the same 10-step interval as save_steps
eval_strategy: steps
eval_steps: 10
val_set_size: 256
# Prepared (tokenized) dataset location; the "seed42" suffix matches `seed: 42`
shuffle_merged_datasets: true
dataset_prepared_path: ./00-Tokenized-Datasets/Granite-3.1-Earthen-v0.3-3B-A800M-LoRA-seed42

# Training hyperparameters
num_epochs: 1

# Batch geometry: micro_batch_size 2 x gradient_accumulation_steps 8 at
# sequence_len 8192 gives the 131,072 tokens per step quoted in the model card.
micro_batch_size: 2
gradient_accumulation_steps: 8
eval_batch_size: 2

# Optimizer
# NOTE(review): the optim_args flags presumably belong to the CAME fork named
# in the requirements (branch sr-grams-cautious-8bit) - confirm.
optimizer: came_pytorch
optim_args:
  enable_8bit: true
  enable_cautious: true
  enable_stochastic_rounding: true
weight_decay: 0.01
max_grad_norm: 0.5

# Learning-rate schedule (no warmup)
# NOTE(review): cosine_min_lr_ratio is named for cosine schedules; confirm the
# rex scheduler reads it.
learning_rate: 2.5e-7
lr_scheduler: rex
cosine_min_lr_ratio: 0.05
warmup_steps: 0

# Log every step
logging_steps: 1

# Model optimization
sdp_attention: true
gradient_checkpointing: offload

# Liger integration: the plugin plus each liger_* switch, all enabled
plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_cross_entropy: true
liger_glu_activation: true
liger_layer_norm: true
liger_rms_norm: true
liger_rope: true

# LoRA kernel switches, all disabled
lora_qkv_kernel: false
lora_o_kernel: false
lora_mlp_kernel: false

# Reproducibility and debugging: fixed seed of 42, debug mode switched on
seed: 42
debug: true

# Token config
# The pad, eos and bos roles all map to the same "<|end_of_text|>" token,
# which is also the turn terminator shown in the prompt format.
special_tokens:
  pad_token: "<|end_of_text|>"
  eos_token: "<|end_of_text|>"
  bos_token: "<|end_of_text|>"
tokens: