ollama-ollama资源-CSDN文库

共727个文件

go：185个

cu：146个

h：60个

需积分: 1 · 90 浏览量 · 2025-01-28 06:05:36 上传 · 评论 · 收藏 · 10.64MB · ZIP · 举报

资源推荐

资源详情

资源评论

收起资源包目录

ollama-ollama （727个子文件）

setup.bmp 76KB

ggml-cpu-quants.c 479KB

ggml-cpu.c 471KB

ggml.c 247KB

ggml-quants.c 210KB

ggml-alloc.c 39KB

gpu_info_oneapi.c 8KB

gpu_info_nvcuda.c 8KB

gpu_info_cudart.c 6KB

gpu_info_nvml.c 3KB

llama.cpp 557KB

ggml-cpu-aarch64.cpp 240KB

unicode-data.cpp 166KB

clip.cpp 120KB

mmq.cpp 106KB

llama-model.cpp 102KB

llama-arch.cpp 83KB

llama-sampling.cpp 78KB

llama-vocab.cpp 77KB

ggml-backend.cpp 77KB

sgemm.cpp 69KB

common.cpp 68KB

llama-context.cpp 66KB

json-schema-to-grammar.cpp 44KB

llama-quant.cpp 44KB

llama-grammar.cpp 42KB

llama-model-loader.cpp 42KB

mllama.cpp 34KB

unicode.cpp 33KB

llama-kv-cache.cpp 28KB

llava.cpp 25KB

llama-chat.cpp 25KB

ggml-cpu.cpp 22KB

llama-mmap.cpp 19KB

sampling.cpp 19KB

ggml-blas.cpp 18KB

ggml-backend-reg.cpp 18KB

llama-batch.cpp 14KB

llama-adapter.cpp 13KB

log.cpp 11KB

amx.cpp 9KB

llama-impl.cpp 7KB

llama-hparams.cpp 3KB

ggml-cpu-traits.cpp 2KB

sampling_ext.cpp 2KB

ggml-threading.cpp 1KB

llama-cparams.cpp 1KB

build-info.cpp 169B

Makefile.cpu 2KB

app.css 518B

ggml-cuda.cu 129KB

convert.cu 26KB

cpy.cu 24KB

rope.cu 21KB

mmvq.cu 20KB

unary.cu 16KB

fattn.cu 15KB

binbcast.cu 14KB

fattn-tile-f16.cu 14KB

fattn-tile-f32.cu 14KB

mmv.cu 12KB

softmax.cu 9KB

concat.cu 9KB

norm.cu 8KB

getrows.cu 8KB

cross-entropy-loss.cu 7KB

quantize.cu 7KB

mmq.cu 6KB

im2col.cu 6KB

pad.cu 5KB

argsort.cu 5KB

conv-transpose-1d.cu 4KB

pool2d.cu 4KB

wkv6.cu 4KB

opt-step-adamw.cu 4KB

argmax.cu 4KB

upscale.cu 3KB

count-equal.cu 3KB

acc.cu 3KB

tsembd.cu 3KB

diagmask.cu 3KB

out-prod.cu 3KB

sum.cu 3KB

arange.cu 2KB

sumrows.cu 2KB

clamp.cu 2KB

scale.cu 2KB

fattn-wmma-f16-instance-kqfloat-cpb16.cu 2KB

fattn-wmma-f16-instance-kqhalf-cpb16.cu 2KB

fattn-wmma-f16-instance-kqhalf-cpb32.cu 2KB

fattn-wmma-f16-instance-kqfloat-cpb32.cu 2KB

fattn-wmma-f16-instance-kqhalf-cpb8.cu 1KB

fattn-vec-f16-instance-hs128-q5_0-q8_0.cu 1KB

fattn-vec-f16-instance-hs128-q8_0-q5_0.cu 1KB

fattn-vec-f16-instance-hs128-q5_0-q4_0.cu 1KB

fattn-vec-f32-instance-hs128-q5_0-q5_1.cu 1KB

fattn-vec-f16-instance-hs128-q5_0-q5_0.cu 1KB

fattn-vec-f16-instance-hs128-q4_0-q5_0.cu 1KB

fattn-vec-f32-instance-hs128-q4_1-q8_0.cu 1KB

fattn-vec-f32-instance-hs128-q4_0-q4_1.cu 1KB

共 727 条

/** * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file * * MIT License * * Copyright (c) 2023-2024 The ggml authors * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define GGML_COMMON_IMPL_C #include "ggml-common.h" #include "ggml-quants.h" #include "ggml-cpu-quants.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" #include "ggml-cpu.h" #include <math.h> #include <string.h> #include <assert.h> #include <float.h> #include <stdlib.h> // for qsort #include <stdio.h> // for GGML_ASSERT #define GROUP_MAX_EPS 1e-15f #define GROUP_MAX_EPS_IQ3_XXS 1e-8f #define GROUP_MAX_EPS_IQ2_S 1e-8f #define GROUP_MAX_EPS_IQ1_M 1e-7f #define GROUP_MAX_EPS_IQ1_S 1e-12f #if defined(_MSC_VER) // disable "possible loss of data" to avoid warnings for hundreds of casts // we should just be careful :) #pragma warning(disable: 4244 4267) #endif #define UNUSED GGML_UNUSED // some compilers don't provide _mm256_set_m128i, e.g. 
gcc 7 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) // multiply int8_t, add results pairwise twice static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) { // Get absolute values of x vectors const __m128i ax = _mm_sign_epi8(x, x); // Sign the values of the y vectors const __m128i sy = _mm_sign_epi8(y, x); // Perform multiplication and create 16-bit values const __m128i dot = _mm_maddubs_epi16(ax, sy); const __m128i ones = _mm_set1_epi16(1); return _mm_madd_epi16(ones, dot); } #if __AVX__ || __AVX2__ || __AVX512F__ // horizontally add 8 floats static inline float hsum_float_8(const __m256 x) { __m128 res = _mm256_extractf128_ps(x, 1); res = _mm_add_ps(res, _mm256_castps256_ps128(x)); res = _mm_add_ps(res, _mm_movehl_ps(res, res)); res = _mm_add_ss(res, _mm_movehdup_ps(res)); return _mm_cvtss_f32(res); } // horizontally add 8 int32_t static inline int hsum_i32_8(const __m256i a) { const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); const __m128i sum64 = _mm_add_epi32(hi64, sum128); const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } // horizontally add 4 int32_t static inline int hsum_i32_4(const __m128i a) { const __m128i hi64 = _mm_unpackhi_epi64(a, a); const __m128i sum64 = _mm_add_epi32(hi64, a); const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); } #if defined(__AVX2__) || defined(__AVX512F__) // spread 32 bits to 32 bytes { 0x00, 0xFF } static inline __m256i bytes_from_bits_32(const uint8_t * x) { uint32_t x32; memcpy(&x32, x, sizeof(uint32_t)); const __m256i shuf_mask = _mm256_set_epi64x( 0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 
0x0000000000000000); __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask); const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); bytes = _mm256_or_si256(bytes, bit_mask); return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); } // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) { const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi); const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp); const __m256i lowMask = _mm256_set1_epi8( 0xF ); return _mm256_and_si256(lowMask, bytes); } // add int16_t pairwise and return as float vector static inline __m256 sum_i16_pairs_float(const __m256i x) { const __m256i ones = _mm256_set1_epi16(1); const __m256i summed_pairs = _mm256_madd_epi16(ones, x); return _mm256_cvtepi32_ps(summed_pairs); } static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { #if defined(__AVX512VNNI__) && defined(__AVX512VL__) const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); return _mm256_cvtepi32_ps(summed_pairs); #elif defined(__AVXVNNI__) const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy); return _mm256_cvtepi32_ps(summed_pairs); #else // Perform multiplication and create 16-bit values const __m256i dot = _mm256_maddubs_epi16(ax, sy); return sum_i16_pairs_float(dot); #endif } // multiply int8_t, add results pairwise twice and return as float vector static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { #if __AVXVNNIINT8__ const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); return _mm256_cvtepi32_ps(summed_pairs); #else // Get absolute values of x vectors const __m256i ax = _mm256_sign_epi8(x, x); // Sign the values of the y vectors const __m256i sy = 
_mm256_sign_epi8(y, x); return mul_sum_us8_pairs_float(ax, sy); #endif } static inline __m128i packNibbles( __m256i bytes ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh #if __AVX512F__ const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh return _mm256_cvtepi16_epi8(bytes); // abcd_efgh #else const __m256i lowByte = _mm256_set1_epi16( 0xFF ); __m256i high = _mm256_andnot_si256( lowByte, bytes ); __m256i low = _mm256_and_si256( lowByte, bytes ); high = _mm256_srli_epi16( high, 4 ); bytes = _mm256_or_si256( low, high ); // Compress uint16_t lanes into bytes __m128i r0 = _mm256_castsi256_si128( bytes ); __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); return _mm_packus_epi16( r0, r1 ); #endif } #elif defined(__AVX__) static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) { // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh const __m128i lowByte = _mm_set1_epi16( 0xFF ); __m128i high = _mm_andnot_si128( lowByte, bytes1 ); __m128i low = _mm_and_si128( lowByte, bytes1 ); high = _mm_srli_epi16( high, 4 ); bytes1 = _mm_or_si128( low, high ); high = _mm_andnot_si128( lowByte, bytes2 ); low = _mm_and_si128( lowByte, bytes2 ); high = _mm_srli_epi16( high, 4 ); bytes2 = _mm_or_si128( low, high ); return _mm_packus_epi16( bytes1, bytes2); } static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { const __m128i ax = _mm_sign_epi8(x, x); const __m128i sy = _mm_sign_epi8(y, x); return _mm_maddubs_epi16(ax, sy); } // spread 32 bits

评论收藏

内容反馈