support xpu in tuning and inference by wenhuach21 · Pull Request #481 · intel/auto-round · GitHub

support xpu in tuning and inference #481


Merged · 4 commits · Apr 1, 2025
83 changes: 50 additions & 33 deletions auto_round/auto_quantizer.py
@@ -126,7 +126,7 @@ def from_config(cls, quantization_config: Union[QuantizationConfigMixin, Dict],
else:
quantization_config = AutoQuantizationConfig.from_dict(quantization_config) # pylint: disable=E1101
quant_method = quantization_config.quant_method

# Again, we need a special care for bnb as we have a single quantization config
# class for both 4-bit and 8-bit quantization
if quant_method == QuantizationMethod.BITS_AND_BYTES:
@@ -170,15 +170,15 @@ def merge_quantization_configs(
warning_msg = ""

loading_attr_dict = quantization_config_from_args.get_loading_attributes() \
if quantization_config_from_args is not None else None
if quantization_config_from_args is not None else None
if isinstance(quantization_config, dict):
if "auto-round" in quantization_config["quant_method"]:
quantization_config = AutoRoundConfig.from_dict(quantization_config)
else:
if isinstance(quantization_config_from_args, (AutoRoundConfig)):
if isinstance(quantization_config_from_args, (AutoRoundConfig)):
logger.info(f"Loading quantized model in auto_round format.")
tmp_backend = quantization_config["quant_method"]
if "auto-round" not in tmp_backend and "gptq" not in tmp_backend and "awq" not in tmp_backend:
if "auto-round" not in tmp_backend and "gptq" not in tmp_backend and "awq" not in tmp_backend:
logger.error("could not convert to auto_round format, currently only supports `gptq`,`awq` or "
"`auto-round` format")
exit(-1)
@@ -187,7 +187,7 @@ def merge_quantization_configs(
target_backend = loading_attr_dict["backend"]
loading_attr_dict.pop("backend")
if "auto_round" not in target_backend:
target_backend = f"auto_round:{tmp_backend}" #
target_backend = f"auto_round:{tmp_backend}" #
quantization_config = AutoRoundConfig.from_dict(quantization_config)
setattr(quantization_config, "backend", target_backend)
else:
@@ -196,7 +196,7 @@ def merge_quantization_configs(
if isinstance(quantization_config,
(GPTQConfig, AwqConfig, AutoRoundConfig)) and quantization_config_from_args is not None:
# special case for GPTQ / AWQ config collision

for attr, val in loading_attr_dict.items():
setattr(quantization_config, attr, val)
warning_msg += (
@@ -208,7 +208,7 @@ def supports_quant_method(quantization_config_dict):
warnings.warn(warning_msg)

return quantization_config

@staticmethod
def supports_quant_method(quantization_config_dict):
from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING
@@ -220,7 +220,7 @@ def supports_quant_method(quantization_config_dict):
quant_method = QuantizationMethod.BITS_AND_BYTES + suffix
elif quant_method is None:
raise ValueError(
"The model's quantization config from the arguments has no `quant_method` attribute."\
"The model's quantization config from the arguments has no `quant_method` attribute." \
"Make sure that the model has been correctly quantized"
)

@@ -287,15 +287,11 @@ def post_init(self):
raise ValueError("group_size must be greater than 0 or equal to -1")

def get_loading_attributes(self):
# attributes_dict = copy.deepcopy(self.__dict__)
loading_attibutes_dict = {"backend": self.backend}
# loading_attributes = ["backend"]
# loading_attibutes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes}
loading_attibutes_dict = {"target_backend": self.backend}
return loading_attibutes_dict

def to_dict(self):
config_dict = super().to_dict()
config_dict.pop("disable_exllama", None)
return config_dict


@@ -355,7 +351,18 @@ def find_backend(self, target_backend: str):
# Return None if no matching backend or alias is found
return None

def detect_auto_device(self):
if torch.cuda.is_available():
return "cuda"
elif is_hpu_supported():
return "hpu"
elif torch.xpu.is_available():
return "xpu"
else:
return "cpu"

def detect_device(self, target_backend, orig_backend):
##TODO need to refine later
"""Detects the appropriate device for the specified backend.

This function determines the device type based on the target backend. If the target backend is
@@ -385,29 +392,29 @@ def detect_device(self, target_backend, orig_backend):
return "cuda"
elif "hpu" in target_backend:
return "hpu"
elif "xpu" in target_backend:
return "xpu"
elif "cpu" in target_backend:
return "cpu"
Review comment (Contributor):
How about:

    for device_type in ["cuda", "hpu", "xpu", "cpu"]:
        if device_type in target_backend:
            return device_type

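A rough sketch of how the suggested loop could be applied, written as a standalone helper purely for illustration (the helper name and the example calls are assumptions, not code from this PR):

    # Hypothetical helper sketching the reviewer's suggestion: scan the backend
    # string for a known device name instead of chaining elif branches.
    def device_from_backend_name(target_backend: str):
        for device_type in ["cuda", "hpu", "xpu", "cpu"]:
            if device_type in target_backend:
                return device_type
        return None  # no device hint in the string; caller falls back to auto detection

    # e.g. device_from_backend_name("auto_round:gptq:cuda") -> "cuda"
    # e.g. device_from_backend_name("auto_round:ipex_gptq") -> None
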
# Determine the device automatically based on availability
if target_backend.split(":")[0] == "auto":
if torch.cuda.is_available():
return "cuda"
elif is_hpu_supported():
return "hpu"
else:
return "cpu"
return self.detect_auto_device()

# Find the backend and determine the device type from BackendInfos
backend = self.find_backend(target_backend)
if backend is None:
raise ValueError("Backend not found, please set it to 'auto' to have a try ")
raise ValueError("Backend is not found, please set it to 'auto' to have a try ")
Review comment (Contributor): set it to 'auto' and try again?

Review comment (Contributor): I feel that this sentence should not add 'is'; it should read 'Backend not found'.

device = BackendInfos[backend].device[0]
if "cuda" in device and torch.cuda.is_available():
return device
elif "hpu" in device and is_hpu_supported():
return device
elif "xpu" in device and torch.xpu.is_available():
return device
else:
## trick
return "cpu"

def convert_model(self, model: nn.Module):
@@ -436,7 +443,6 @@ def convert_model(self, model: nn.Module):
if not hasattr(quantization_config, "target_backend"):
quantization_config.target_backend = quantization_config.backend

target_device = self.detect_device(quantization_config.target_backend, quantization_config.backend)
target_device = self.detect_device(None, quantization_config.backend)

self.target_device = target_device
@@ -451,13 +457,13 @@ def convert_model(self, model: nn.Module):
data_type = quantization_config.data_type if hasattr(quantization_config,
"data_type") else "int" # pragma: no cover
sym = quantization_config.sym

quant_block_list = quantization_config.quant_block_list if hasattr(quantization_config,
"quant_block_list") else None
"quant_block_list") else None

if quant_block_list is None:
to_quant_block_names = quantization_config.to_quant_block_names if hasattr(quantization_config,
"to_quant_block_names") else None
"to_quant_block_names") else None
if to_quant_block_names is not None:
if isinstance(to_quant_block_names, (list, tuple)):
quant_block_list = to_quant_block_names
@@ -474,22 +480,20 @@ def convert_model(self, model: nn.Module):
extra_config = {}
if hasattr(quantization_config, "extra_config"):
extra_config = quantization_config.extra_config
if hasattr(quantization_config, "modules_in_block_to_quantize"):##gptq format
if hasattr(quantization_config, "modules_in_block_to_quantize"): ##gptq format
modules_in_block_to_quantize_tmp = quantization_config.modules_in_block_to_quantize
modules_in_block_to_quantize = [item for sublist in modules_in_block_to_quantize_tmp for item in sublist]
for layer_name in layer_names:
quantized = False
for qname in modules_in_block_to_quantize:
if qname in layer_name:
quantized=True
quantized = True
break
if not quantized:
extra_config[layer_name]={"bits":16}
extra_config[layer_name] = {"bits": 16}
if hasattr(quantization_config, "modules_to_not_convert"):
for layer_name in quantization_config.modules_to_not_convert:
extra_config[layer_name]={"bits":16}


extra_config[layer_name] = {"bits": 16}

layer_names += extra_config.keys()
layer_names = list(set(layer_names))
@@ -514,6 +518,8 @@ def convert_model(self, model: nn.Module):
backend = quantization_config.backend
elif 'gptq' in quantization_config.quant_method: # pragma: no cover
backend = 'gptq'
elif "awq" in quantization_config.quant_method:
backend = "awq"
else: # pragma: no cover
logger.error("Please specify quantization backend")
raise ValueError("Quantization backend must be specified.")
@@ -670,8 +676,19 @@ def cpu_post_init(self, model):
for n, layer in tqdm(layers, desc=message, total=len(layers),
leave=True):
layer.post_init()
return model


def xpu_post_init(self, model):
message = "Repacking to XPU format"
from auto_round_extension.ipex import ipex_qlinear_classes
cpu_layers = tuple(list(ipex_qlinear_classes))
layers = [] ## ipex post_init will add one more layer
for n, m in model.named_modules():
if isinstance(m, cpu_layers):
layers.append((n, m))
for n, layer in tqdm(layers, desc=message, total=len(layers),
leave=True):
layer.post_init()
return model

def repack_marlin(self, model):
Expand Down Expand Up @@ -783,6 +800,8 @@ class StoreAttr(object):
# there are no side-effects after call qbits_post_init when model quant-type not equal to qbits.
if self.target_device == "cpu":
model = self.cpu_post_init(model)
elif self.target_device == "xpu":
model = self.xpu_post_init(model)

return model

@@ -816,5 +835,3 @@ def is_serializable(self):

transformers.quantizers.auto.AutoHfQuantizer = AutoHfQuantizer
transformers.modeling_utils.AutoHfQuantizer = AutoHfQuantizer


37 changes: 18 additions & 19 deletions auto_round/backend.py
@@ -137,7 +137,7 @@ def check_auto_round_exllamav2_installed():
packing_format="triton_zp+-1",
bits=[2, 3, 4, 8], group_size=None,
priority=1, feature_checks=[feature_multiply_checker_32],
alias=["auto_round:auto_gptq:cuda","auto_gptq:cuda","auto_round:gptq:cuda"],
alias=["auto_round:auto_gptq:cuda", "auto_gptq:cuda", "auto_round:gptq:cuda"],
convertable_format=["triton_zp+-1"],
requirements=["auto-gptq>=0.7.1"]
)
@@ -169,31 +169,31 @@ def check_auto_round_exllamav2_installed():
)

BackendInfos['auto_round:qbits_awq'] = BackendInfo(device=["cpu"], sym=[True],
packing_format="awq",
bits=[2, 4, 8], group_size=None,
priority=0 if "intel" in get_cpu_manufacturer() else 5,
feature_checks=[],
requirements=["intel-extension-for-transformers"]
)
packing_format="awq",
bits=[2, 4, 8], group_size=None,
priority=0 if "intel" in get_cpu_manufacturer() else 5,
feature_checks=[],
requirements=["intel-extension-for-transformers"]
)

BackendInfos['auto_round:ipex_gptq'] = BackendInfo(device=["cpu"], sym=[True, False],
BackendInfos['auto_round:ipex_gptq'] = BackendInfo(device=["cpu", "xpu"], sym=[True, False],
packing_format="ipex_gptq",
bits=[4], group_size=None,
priority=5 if "intel" in get_cpu_manufacturer() else 5,
feature_checks=[],
convertable_format=["triton_zp+-1"],
requirements=["intel-extension-for-pytorch>=2.4"]
requirements=["intel-extension-for-pytorch>=2.5"]
)

BackendInfos['auto_round:ipex_awq'] = BackendInfo(device=["cpu"], sym=[True, False],
packing_format="ipex_awq",
bits=[4], group_size=None,
priority=5 if "intel" in get_cpu_manufacturer() else 5,
feature_checks=[],
##convertable_format=["triton_zp+-1", "awq"],
convertable_format=["awq"],
requirements=["intel-extension-for-pytorch>=2.4"]
)
BackendInfos['auto_round:ipex_awq_xpu'] = BackendInfo(device=["cpu", "xpu"], sym=[True, False],
packing_format="ipex_awq",
bits=[4], group_size=None,
priority=1,
feature_checks=[],
##convertable_format=["triton_zp+-1", "awq"],
convertable_format=["awq"],
requirements=["intel-extension-for-pytorch>=2.6"]
)

# BackendInfos['auto_round:marlin'] = BackendInfo(device=["gpu"], sym=[True],
# packing_format="marlin",
@@ -554,4 +554,3 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f
reverse=True)

return supported_backends[0]

8 changes: 7 additions & 1 deletion auto_round/utils.py
@@ -588,6 +588,8 @@ def is_valid_digit(s):
elif is_optimum_habana_available(): # pragma: no cover
device = torch.device("hpu")
# logger.info("Using HPU device")
elif torch.xpu.is_available(): # pragma: no cover
device = torch.device("xpu")
# Use CPU as a fallback
else:
device = torch.device("cpu")
@@ -1159,9 +1161,13 @@ def check_awq_gemm_compatibility(model, bits, group_size, sym, layer_configs=Non
def get_device_and_parallelism(device):
from auto_round.utils import detect_device
devices = device.replace(" ", "").split(',')
if all(s.isdigit() for s in devices) and len(devices) > 1:
if all(s.isdigit() for s in devices) and len(devices) > 1 and torch.cuda.is_available():
device = "cuda"
parallelism = True
elif all(s.isdigit() for s in devices) and len(devices) > 1 and torch.xpu.is_available():
device = "xpu"
parallelism = False
# pragma: no cover
elif device == "auto":
device = detect_device(device)
parallelism = True