|
|
|
@ -1420,6 +1420,34 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
|
|
|
|
|
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
|
|
|
|
[GGML_TYPE_Q4_0] = {
|
|
|
|
|
.dequantize_row_q = dequantize_row_q4_0,
|
|
|
|
|
.quantize_row_q = quantize_row_q4_0,
|
|
|
|
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
|
|
|
|
|
.quantize_row_q_dot = quantize_row_q8_0,
|
|
|
|
|
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
|
|
|
|
|
},
|
|
|
|
|
[GGML_TYPE_Q4_1] = {
|
|
|
|
|
.dequantize_row_q = dequantize_row_q4_1,
|
|
|
|
|
.quantize_row_q = quantize_row_q4_1,
|
|
|
|
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
|
|
|
|
|
.quantize_row_q_dot = quantize_row_q4_1,
|
|
|
|
|
.vec_dot_q = ggml_vec_dot_q4_1,
|
|
|
|
|
},
|
|
|
|
|
// TODO: GGML_TYPE_Q8_0
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// For internal test use
|
|
|
|
|
quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
|
|
|
GGML_ASSERT(i < GGML_TYPE_COUNT);
|
|
|
|
|
return quantize_fns[i];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
|
// simd mappings
|
|
|
|
|
//
|
|
|
|
@ -5588,6 +5616,26 @@ static void ggml_compute_forward_dup_f16(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
|
|
|
|
|
quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
|
|
|
|
|
size_t id = 0;
|
|
|
|
|
uint8_t * dst_ptr = (uint8_t *) dst->data;
|
|
|
|
|
size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
|
|
|
|
|
float * src0_f32 = (float *) params->wdata;
|
|
|
|
|
|
|
|
|
|
for (int i03 = 0; i03 < ne03; i03++) {
|
|
|
|
|
for (int i02 = 0; i02 < ne02; i02++) {
|
|
|
|
|
for (int i01 = 0; i01 < ne01; i01++) {
|
|
|
|
|
const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
|
|
|
|
// convert to f32 and quantize
|
|
|
|
|
for (int i00 = 0; i00 < ne00; i00++) {
|
|
|
|
|
src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
|
|
|
|
|
}
|
|
|
|
|
quantize_row_q(src0_f32, dst_ptr + id, ne00);
|
|
|
|
|
id += dst_row_size;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
GGML_ASSERT(false); // TODO: implement
|
|
|
|
|
}
|
|
|
|
@ -5780,6 +5828,21 @@ static void ggml_compute_forward_dup_f32(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (dst->type == GGML_TYPE_Q4_0 || dst->type == GGML_TYPE_Q4_1) {
|
|
|
|
|
quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
|
|
|
|
|
size_t id = 0;
|
|
|
|
|
uint8_t * dst_ptr = (uint8_t *) dst->data;
|
|
|
|
|
size_t dst_row_size = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
|
|
|
|
|
|
|
|
|
|
for (int i03 = 0; i03 < ne03; i03++) {
|
|
|
|
|
for (int i02 = 0; i02 < ne02; i02++) {
|
|
|
|
|
for (int i01 = 0; i01 < ne01; i01++) {
|
|
|
|
|
const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
|
|
|
|
quantize_row_q(src0_ptr, dst_ptr + id, ne00);
|
|
|
|
|
id += dst_row_size;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
GGML_ASSERT(false); // TODO: implement
|
|
|
|
|
}
|
|
|
|
@ -5968,6 +6031,212 @@ static void ggml_compute_forward_add_f32(
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ggml_compute_forward_add_f16_f32(
|
|
|
|
|
const struct ggml_compute_params * params,
|
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
|
const struct ggml_tensor * src1,
|
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
|
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
|
|
|
|
|
|
|
|
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const int ith = params->ith;
|
|
|
|
|
const int nth = params->nth;
|
|
|
|
|
|
|
|
|
|
const int n = ggml_nrows(src0);
|
|
|
|
|
const int nc = src0->ne[0];
|
|
|
|
|
|
|
|
|
|
const size_t nb00 = src0->nb[0];
|
|
|
|
|
const size_t nb01 = src0->nb[1];
|
|
|
|
|
|
|
|
|
|
const size_t nb10 = src1->nb[0];
|
|
|
|
|
const size_t nb11 = src1->nb[1];
|
|
|
|
|
|
|
|
|
|
const size_t nb0 = dst->nb[0];
|
|
|
|
|
const size_t nb1 = dst->nb[1];
|
|
|
|
|
|
|
|
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
|
GGML_ASSERT(dst->type == GGML_TYPE_F16);
|
|
|
|
|
|
|
|
|
|
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
|
|
|
|
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
|
|
|
|
|
|
|
|
|
if (nb10 == sizeof(float)) {
|
|
|
|
|
for (int j = ith; j < n; j += nth) {
|
|
|
|
|
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
|
|
|
|
|
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
|
|
|
|
|
for (int i = 0; i < nc; i++) {
|
|
|
|
|
float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
|
|
|
|
|
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
// src1 is not contiguous
|
|
|
|
|
GGML_ASSERT(false);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ggml_compute_forward_add_f16_f16(
|
|
|
|
|
const struct ggml_compute_params * params,
|
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
|
const struct ggml_tensor * src1,
|
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
|
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
|
|
|
|
|
|
|
|
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const int ith = params->ith;
|
|
|
|
|
const int nth = params->nth;
|
|
|
|
|
|
|
|
|
|
const int n = ggml_nrows(src0);
|
|
|
|
|
const int nc = src0->ne[0];
|
|
|
|
|
|
|
|
|
|
const size_t nb00 = src0->nb[0];
|
|
|
|
|
const size_t nb01 = src0->nb[1];
|
|
|
|
|
|
|
|
|
|
const size_t nb10 = src1->nb[0];
|
|
|
|
|
const size_t nb11 = src1->nb[1];
|
|
|
|
|
|
|
|
|
|
const size_t nb0 = dst->nb[0];
|
|
|
|
|
const size_t nb1 = dst->nb[1];
|
|
|
|
|
|
|
|
|
|
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F16);
|
|
|
|
|
GGML_ASSERT(dst->type == GGML_TYPE_F16);
|
|
|
|
|
|
|
|
|
|
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
|
|
|
|
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
|
|
|
|
|
|
|
|
|
if (nb10 == sizeof(ggml_fp16_t)) {
|
|
|
|
|
for (int j = ith; j < n; j += nth) {
|
|
|
|
|
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
|
|
|
|
|
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
|
|
|
|
|
for (int i = 0; i < nc; i++) {
|
|
|
|
|
ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
|
|
|
|
|
dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
// src1 is not contiguous
|
|
|
|
|
GGML_ASSERT(false);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ggml_compute_forward_add_q_f32(
|
|
|
|
|
const struct ggml_compute_params * params,
|
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
|
const struct ggml_tensor * src1,
|
|
|
|
|
struct ggml_tensor * dst) {
|
|
|
|
|
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
|
|
|
|
|
|
|
|
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const int64_t ne00 = src0->ne[0];
|
|
|
|
|
const int64_t ne01 = src0->ne[1];
|
|
|
|
|
const int64_t ne02 = src0->ne[2];
|
|
|
|
|
const int64_t ne03 = src0->ne[3];
|
|
|
|
|
|
|
|
|
|
//const int64_t ne10 = src1->ne[0];
|
|
|
|
|
//const int64_t ne11 = src1->ne[1];
|
|
|
|
|
const int64_t ne12 = src1->ne[2];
|
|
|
|
|
const int64_t ne13 = src1->ne[3];
|
|
|
|
|
|
|
|
|
|
//const int64_t ne0 = dst->ne[0];
|
|
|
|
|
//const int64_t ne1 = dst->ne[1];
|
|
|
|
|
const int64_t ne2 = dst->ne[2];
|
|
|
|
|
const int64_t ne3 = dst->ne[3];
|
|
|
|
|
|
|
|
|
|
const int nb00 = src0->nb[0];
|
|
|
|
|
const int nb01 = src0->nb[1];
|
|
|
|
|
const int nb02 = src0->nb[2];
|
|
|
|
|
const int nb03 = src0->nb[3];
|
|
|
|
|
|
|
|
|
|
const int nb10 = src1->nb[0];
|
|
|
|
|
const int nb11 = src1->nb[1];
|
|
|
|
|
const int nb12 = src1->nb[2];
|
|
|
|
|
const int nb13 = src1->nb[3];
|
|
|
|
|
|
|
|
|
|
const int nb0 = dst->nb[0];
|
|
|
|
|
const int nb1 = dst->nb[1];
|
|
|
|
|
const int nb2 = dst->nb[2];
|
|
|
|
|
const int nb3 = dst->nb[3];
|
|
|
|
|
|
|
|
|
|
const int ith = params->ith;
|
|
|
|
|
const int nth = params->nth;
|
|
|
|
|
|
|
|
|
|
GGML_ASSERT(ne02 == ne12);
|
|
|
|
|
GGML_ASSERT(ne03 == ne13);
|
|
|
|
|
GGML_ASSERT(ne2 == ne12);
|
|
|
|
|
GGML_ASSERT(ne3 == ne13);
|
|
|
|
|
|
|
|
|
|
const enum ggml_type type = src0->type;
|
|
|
|
|
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
|
|
|
|
|
quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
|
|
|
|
|
|
|
|
|
|
// we don't support permuted src0 or src1
|
|
|
|
|
GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
|
|
|
|
|
GGML_ASSERT(nb10 == sizeof(float));
|
|
|
|
|
|
|
|
|
|
// dst cannot be transposed or permuted
|
|
|
|
|
GGML_ASSERT(nb0 <= nb1);
|
|
|
|
|
GGML_ASSERT(nb1 <= nb2);
|
|
|
|
|
GGML_ASSERT(nb2 <= nb3);
|
|
|
|
|
|
|
|
|
|
GGML_ASSERT(src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1);
|
|
|
|
|
GGML_ASSERT(dst->type == src0->type);
|
|
|
|
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
|
|
|
|
|
|
|
|
// total rows in src0
|
|
|
|
|
const int nr = ne01*ne02*ne03;
|
|
|
|
|
|
|
|
|
|
// rows per thread
|
|
|
|
|
const int dr = (nr + nth - 1)/nth;
|
|
|
|
|
|
|
|
|
|
// row range for this thread
|
|
|
|
|
const int ir0 = dr*ith;
|
|
|
|
|
const int ir1 = MIN(ir0 + dr, nr);
|
|
|
|
|
|
|
|
|
|
float * wdata = (float*) params->wdata + ne00 * ith;
|
|
|
|
|
|
|
|
|
|
for (int ir = ir0; ir < ir1; ++ir) {
|
|
|
|
|
// src0 indices
|
|
|
|
|
const int i03 = ir/(ne02*ne01);
|
|
|
|
|
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
|
|
|
|
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
|
|
|
|
|
|
|
|
|
// src1 and dst are same shape as src0 => same indices
|
|
|
|
|
const int i13 = i03;
|
|
|
|
|
const int i12 = i02;
|
|
|
|
|
const int i11 = i01;
|
|
|
|
|
|
|
|
|
|
const int i3 = i03;
|
|
|
|
|
const int i2 = i02;
|
|
|
|
|
const int i1 = i01;
|
|
|
|
|
|
|
|
|
|
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
|
|
|
|
|
float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
|
|
|
|
|
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
|
|
|
|
|
|
|
|
|
|
assert(ne00 % 32 == 0);
|
|
|
|
|
|
|
|
|
|
// unquantize row from src0 to temp buffer
|
|
|
|
|
dequantize_row_q(src0_row, wdata, ne00);
|
|
|
|
|
// add src1
|
|
|
|
|
ggml_vec_acc_f32(ne00, wdata, src1_row);
|
|
|
|
|
// quantize row to dst
|
|
|
|
|
quantize_row_q(wdata, dst_row, ne00);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ggml_compute_forward_add(
|
|
|
|
|
const struct ggml_compute_params * params,
|
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
@ -5978,6 +6247,23 @@ static void ggml_compute_forward_add(
|
|
|
|
|
{
|
|
|
|
|
ggml_compute_forward_add_f32(params, src0, src1, dst);
|
|
|
|
|
} break;
|
|
|
|
|
case GGML_TYPE_F16:
|
|
|
|
|
{
|
|
|
|
|
if (src1->type == GGML_TYPE_F16) {
|
|
|
|
|
ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
|
|
|
|
|
}
|
|
|
|
|
else if (src1->type == GGML_TYPE_F32) {
|
|
|
|
|
ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
GGML_ASSERT(false);
|
|
|
|
|
}
|
|
|
|
|
} break;
|
|
|
|
|
case GGML_TYPE_Q4_0:
|
|
|
|
|
case GGML_TYPE_Q4_1:
|
|
|
|
|
{
|
|
|
|
|
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
|
|
|
|
|
} break;
|
|
|
|
|
default:
|
|
|
|
|
{
|
|
|
|
|
GGML_ASSERT(false);
|
|
|
|
@ -7257,30 +7543,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|
|
|
|
//}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
|
|
|
|
[GGML_TYPE_Q4_0] = {
|
|
|
|
|
.dequantize_row_q = dequantize_row_q4_0,
|
|
|
|
|
.quantize_row_q = quantize_row_q4_0,
|
|
|
|
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
|
|
|
|
|
.quantize_row_q_dot = quantize_row_q8_0,
|
|
|
|
|
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
|
|
|
|
|
},
|
|
|
|
|
[GGML_TYPE_Q4_1] = {
|
|
|
|
|
.dequantize_row_q = dequantize_row_q4_1,
|
|
|
|
|
.quantize_row_q = quantize_row_q4_1,
|
|
|
|
|
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
|
|
|
|
|
.quantize_row_q_dot = quantize_row_q4_1,
|
|
|
|
|
.vec_dot_q = ggml_vec_dot_q4_1,
|
|
|
|
|
},
|
|
|
|
|
// TODO: GGML_TYPE_Q8_0
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// For internal test use
|
|
|
|
|
quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
|
|
|
GGML_ASSERT(i < GGML_TYPE_COUNT);
|
|
|
|
|
return quantize_fns[i];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ggml_compute_forward_mul_mat_q_f32(
|
|
|
|
|
const struct ggml_compute_params * params,
|
|
|
|
|
const struct ggml_tensor * src0,
|
|
|
|
@ -10137,13 +10399,29 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
|
|
|
struct ggml_tensor * node = cgraph->nodes[i];
|
|
|
|
|
|
|
|
|
|
switch (node->op) {
|
|
|
|
|
case GGML_OP_CPY:
|
|
|
|
|
case GGML_OP_DUP:
|
|
|
|
|
{
|
|
|
|
|
node->n_tasks = 1;
|
|
|
|
|
|
|
|
|
|
size_t cur = 0;
|
|
|
|
|
if (node->type == GGML_TYPE_Q4_0 || node->type == GGML_TYPE_Q4_1) {
|
|
|
|
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
work_size = MAX(work_size, cur);
|
|
|
|
|
} break;
|
|
|
|
|
case GGML_OP_ADD:
|
|
|
|
|
{
|
|
|
|
|
node->n_tasks = n_threads;
|
|
|
|
|
|
|
|
|
|
size_t cur = 0;
|
|
|
|
|
|
|
|
|
|
if (node->src0->type == GGML_TYPE_Q4_0 || node->src0->type == GGML_TYPE_Q4_1) {
|
|
|
|
|
cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
work_size = MAX(work_size, cur);
|
|
|
|
|
} break;
|
|
|
|
|
case GGML_OP_SUB:
|
|
|
|
|
case GGML_OP_MUL:
|
|
|
|
@ -10224,7 +10502,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
|
|
|
{
|
|
|
|
|
node->n_tasks = n_threads;
|
|
|
|
|
} break;
|
|
|
|
|
case GGML_OP_CPY:
|
|
|
|
|
case GGML_OP_CONT:
|
|
|
|
|
case GGML_OP_RESHAPE:
|
|
|
|
|
case GGML_OP_VIEW:
|
|
|
|
|