Harmon: Harmonizing Visual Representations for Unified Multimodal Understanding and Generation

👁 Image

Harmonizing Visual Representations for Unified Multimodal Understanding and Generation

Size Wu, Wenwei Zhang, Lumin Xu, Sheng Jin, Zhonghua Wu, Qingyi Tao, Wentao Liu, Wei Li, Chen Change Loy

👁 arXiv
👁 Project Page
👁 GitHub
👁 Bibtex

Introduction

Harmon is a novel unified framework for multimodal understanding and generation. Unlike existing state-of-the-art architectures that disentangle visual understanding and generation with different encoder models, the proposed framework harmonizes the visual representations of understanding and generation via a shared MAR encoder. Harmon achieves advanced generation performance on mainstream text-to-image generation benchmarks, and exhibits competitive results on multimodal understanding tasks. In this repo, we provide inference code to run Harmon for image understanding (image-to-text) and text-to-image generation, with two model variants: Harmon-0.5B and Harmon-1.5B.

Model Variant	LLM	MAR	Hugging Face Hub
Harmon-0.5B	Qwen2.5-0.5B-Instruct	MAR-Base	👁 Hugging Face
Harmon-1.5B	Qwen2.5-1.5B-Instruct	MAR-Huge	👁 Hugging Face

Usage

🖌️ Image-to-text Generation

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image
import requests


# Chat-markup pieces used to wrap a request before tokenization.
# Only the 'INSTRUCTION' entry is read by the code in this snippet.
PROMPT_TEMPLATE = {
    'SYSTEM': '<|im_start|>system\n{system}<|im_end|>\n',
    'INSTRUCTION': '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    'SUFFIX': '<|im_end|>',
    'SUFFIX_AS_EOS': True,
    'SEP': '\n',
    'STOP_WORDS': ['<|im_end|>', '<|endoftext|>'],
}


def expand2square(pil_img, background_color):
    """Pad a PIL image to a square canvas with the content centered.

    Args:
        pil_img: Source image; its ``size`` and ``mode`` are read.
        background_color: Fill color for the padded area, passed straight
            to ``Image.new``.

    Returns:
        ``pil_img`` itself when it is already square; otherwise a new image
        whose side equals the longer edge of ``pil_img``, with the original
        pasted in the middle along the shorter axis.
    """
    width, height = pil_img.size
    if width == height:
        # Already square: hand back the very same object, no copy.
        return pil_img

    side = max(width, height)
    # One of the two offsets is always 0 (the axis that already spans `side`).
    offset = ((side - width) // 2, (side - height) // 2)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    canvas.paste(pil_img, offset)
    return canvas


@torch.no_grad()
def question_answer(question,
                    image,
                    model,
                    tokenizer,
                    max_new_tokens=512,
                    image_size=512,
                    image_token_idx=None,
                    ):
    """Answer a question about an image (image-to-text generation).

    Args:
        question: Question text; it is appended after the ``<image>``
            placeholder inside the user turn of the prompt.
        image: PIL image. It is padded to a square with gray
            ``(127, 127, 127)`` and resized to ``image_size``.
            NOTE(review): the ``'h w c -> c h w'`` rearrange needs a
            3-D array, so the image presumably must be RGB - confirm.
        model: Model object providing ``dtype``, ``device``, ``encode``,
            ``extract_visual_feature``, ``mar.buffer_size`` and ``llm``.
        tokenizer: Tokenizer in which ``<image>`` is a single token.
        max_new_tokens: Upper bound on generated tokens.
        image_size: Side length of the model input; only 512 is accepted.
        image_token_idx: Id of the ``<image>`` token. When ``None`` it is
            looked up from ``tokenizer`` so the function does not depend
            on a module-level variable being defined first.

    Returns:
        ``tokenizer.decode`` of the first generated sequence.
    """
    assert image_size == 512
    if image_token_idx is None:
        # Resolve the placeholder id from the tokenizer that was passed in.
        image_token_idx = tokenizer.encode(
            "<image>", add_special_tokens=False)[-1]

    image = expand2square(image, (127, 127, 127))
    image = image.resize(size=(image_size, image_size))
    image = torch.from_numpy(np.array(image)).to(
        dtype=model.dtype, device=model.device)
    image = rearrange(image, 'h w c -> c h w')[None]  # add batch dimension
    image = 2 * (image / 255) - 1  # rescale [0, 255] -> [-1, 1]

    prompt = PROMPT_TEMPLATE['INSTRUCTION'].format(input="<image>\n" + question)
    assert '<image>' in prompt
    # One placeholder per visual embedding: a 16-pixel grid over the image
    # plus the extra buffer positions reported by the MAR module.
    image_length = (image_size // 16) ** 2 + model.mar.buffer_size
    prompt = prompt.replace('<image>', '<image>' * image_length)
    # Follow the model's device instead of hard-coding CUDA, so the ids
    # always live on the same device as the image tensor above.
    input_ids = tokenizer.encode(
        prompt, add_special_tokens=True, return_tensors='pt').to(model.device)

    _, z_enc = model.extract_visual_feature(model.encode(image))
    inputs_embeds = z_enc.new_zeros(*input_ids.shape, model.llm.config.hidden_size)
    is_image = input_ids == image_token_idx
    # Image positions take the visual features; all others take the LLM's
    # ordinary token embeddings.
    inputs_embeds[is_image] = z_enc.flatten(0, 1)
    inputs_embeds[~is_image] = model.llm.get_input_embeddings()(
        input_ids[~is_image]
    )

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id = (tokenizer.pad_token_id
                    if tokenizer.pad_token_id is not None
                    else tokenizer.eos_token_id)
    output = model.llm.generate(inputs_embeds=inputs_embeds,
                                use_cache=True,
                                do_sample=False,
                                max_new_tokens=max_new_tokens,
                                eos_token_id=tokenizer.eos_token_id,
                                pad_token_id=pad_token_id)
    return tokenizer.decode(output[0])


# Load the tokenizer and model from the Hub, then put the model in eval mode
# on the GPU in bfloat16.
harmon_tokenizer = AutoTokenizer.from_pretrained("wusize/Harmon-1_5B",
                                                 trust_remote_code=True)
harmon_model = AutoModel.from_pretrained("wusize/Harmon-1_5B",
                                         trust_remote_code=True).eval().cuda().bfloat16()

# Register the image placeholder as a special token; exactly one new token
# is expected to be added.
special_tokens_dict = {'additional_special_tokens': ["<image>", ]}
num_added_toks = harmon_tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1

# Id of the "<image>" placeholder token.
image_token_idx = harmon_tokenizer.encode("<image>", add_special_tokens=False)[-1]
print(f"Image token: {harmon_tokenizer.decode(image_token_idx)}")

image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
# Use a timeout so a stalled server cannot hang the script, surface HTTP
# errors before PIL tries to parse an error page, and close the streamed
# response once the pixels are loaded (convert() forces the full read).
with requests.get(image_file, stream=True, timeout=30) as response:
    response.raise_for_status()
    raw_image = Image.open(response.raw).convert('RGB')

output_text = question_answer(question='Describe the image in detail.',
                              image=raw_image,
                              model=harmon_model,
                              tokenizer=harmon_tokenizer,
                              )

print(output_text)

🖼️ Text-to-image Generation

import os
import torch
from transformers import AutoTokenizer, AutoModel
from einops import rearrange
from PIL import Image


# Chat-markup pieces used to wrap a request before tokenization.
# Only the 'INSTRUCTION' entry is read by the code in this snippet.
PROMPT_TEMPLATE = {
    'SYSTEM': '<|im_start|>system\n{system}<|im_end|>\n',
    'INSTRUCTION': '<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n',
    'SUFFIX': '<|im_end|>',
    'SUFFIX_AS_EOS': True,
    'SEP': '\n',
    'STOP_WORDS': ['<|im_end|>', '<|endoftext|>'],
}

# Wraps a caption into the user request for text-to-image generation.
GENERATION_TEMPLATE = 'Generate an image: {text}'


@torch.no_grad()
def generate_images(prompts,
                    negative_prompt,
                    tokenizer,
                    model,
                    output,
                    grid_size=2,
                    num_steps=64, cfg_scale=3.0, temperature=1.0, image_size=512):
    """Sample images for each prompt and save one tiled JPEG per prompt.

    Args:
        prompts: List of request strings, one per output file.
        negative_prompt: Request string appended to the batch whenever
            ``cfg_scale`` differs from 1.0.
        tokenizer: Tokenizer called on the whole batch with padding.
        model: Model object providing ``device`` and ``sample``.
        output: Directory for the results; created if missing.
        grid_size: Each prompt yields ``grid_size x grid_size`` samples
            (2 x 2 by default) tiled into a single image.
        num_steps: Forwarded to ``model.sample`` as ``num_iter``.
        cfg_scale: Forwarded to ``model.sample`` as ``cfg``.
        temperature: Forwarded to ``model.sample``.
        image_size: Side length of each sample; only 512 is accepted.

    Returns:
        None. Files are written as ``{output}/{idx:08d}.jpg`` where ``idx``
        is the position of the prompt in ``prompts``.
    """
    assert image_size == 512
    tokens_per_side = image_size // 16
    instruction = PROMPT_TEMPLATE['INSTRUCTION']
    samples_per_prompt = grid_size ** 2

    # The prompt list is repeated as a whole, so the prompt index varies
    # fastest - this matches the '(m n b)' layout unpacked further down.
    batch = [instruction.format(input=text) for text in prompts] * samples_per_prompt

    if cfg_scale != 1.0:
        # One negative entry for every positive entry in the batch.
        batch.extend([instruction.format(input=negative_prompt)] * len(batch))

    inputs = tokenizer(
        batch, add_special_tokens=True, return_tensors='pt', padding=True)
    inputs = inputs.to(model.device)

    samples = model.sample(**inputs,
                           num_iter=num_steps,
                           cfg=cfg_scale,
                           cfg_schedule="constant",
                           temperature=temperature,
                           progress=True,
                           image_shape=(tokens_per_side, tokens_per_side))
    # Tile the samples of each prompt into a grid and move channels last.
    grids = rearrange(samples, '(m n b) c h w -> b (m h) (n w) c',
                      m=grid_size, n=grid_size)

    # Presumably the samples lie in [-1, 1]; rescale to 8-bit pixel values.
    pixels = torch.clamp(127.5 * grids + 128.0, 0, 255)
    pixels = pixels.to("cpu", dtype=torch.uint8).numpy()

    os.makedirs(output, exist_ok=True)
    for idx, grid in enumerate(pixels):
        picture = Image.fromarray(grid)
        picture.save(f"{output}/{idx:08d}.jpg")


# Load the tokenizer and model from the Hub (trust_remote_code=True allows
# the code shipped in the model repository to be executed).
harmon_tokenizer = AutoTokenizer.from_pretrained(
    "wusize/Harmon-1_5B", trust_remote_code=True)
harmon_model = AutoModel.from_pretrained(
    "wusize/Harmon-1_5B", trust_remote_code=True)
# Move to the GPU, cast to bfloat16, and switch to eval mode.
harmon_model = harmon_model.cuda().bfloat16().eval()


# Captions to render; each is wrapped into a generation request.
texts = [
    'a dog on the left and a cat on the right.',
    'a photo of a pink stop sign.',
]
pos_prompts = [GENERATION_TEMPLATE.format(text=caption) for caption in texts]
# Negative request used for classifier-free guidance.
neg_prompt = 'Generate an image.'


# Writes one tiled JPEG per caption into the 'output' directory.
generate_images(
    prompts=pos_prompts,
    negative_prompt=neg_prompt,
    tokenizer=harmon_tokenizer,
    model=harmon_model,
    output='output',
)

📚 Citation

If you find Harmon useful for your research or applications, please cite our paper using the following BibTeX:

@misc{wu2025harmon,
 title={Harmonizing Visual Representations for Unified Multimodal Understanding and Generation}, 
 author={Size Wu and Wenwei Zhang and Lumin Xu and Sheng Jin and Zhonghua Wu and Qingyi Tao and Wentao Liu and Wei Li and Chen Change Loy},
 year={2025},
 eprint={2503.21979},
 archivePrefix={arXiv},
 primaryClass={cs.CV},
 url={https://arxiv.org/abs/2503.21979}, 
}

📜 License

This project is licensed under the NTU S-Lab License 1.0.

Downloads last month: 37

Safetensors

Model size

3B params

Tensor type

BF16

Model tree for wusize/Harmon-1_5B

Finetunes

2 models

Spaces using wusize/Harmon-1_5B 2

Paper for wusize/Harmon-1_5B

Paper • 2503.21979 • Published Mar 27, 2025 • 4

URL: https://huggingface.co/wusize/Harmon-1_5B

⇱ wusize/Harmon-1_5B · Hugging Face