2016年12月 6日星期二 11时58分53秒 CST

RC4 算法

RC4 是一种秘钥长度可变、面向字节操作的流密码。SSL/TLS协议、WEP、WPA中均有所应用

1. RC4 Cipher

RC4 通过生成伪随机位流作为keystream,

加密：该 keystream 与明文 plaintext 做异或操作从而得到密文 ciphertext;
解密：该 keystream 与密文 ciphertext 做异或操作从而得到明文 plaintext;

所以，RC4 的重点在于如何生成 keystream 呢？🤔

其实，to generate the keystream, the cipher makes use of a secret internal state which consists of two parts:

A permutation of all 256 possible bytes (对应下面的 S)
Two 8-bit index-pointers (对应下面的 i 和 j)

Anyway, 生成 keystream 只要有两部分：

Key Scheduling Algorithm (KSA): The permuation S 一开始会由一个长度可变的秘钥 key 进行初始化（key 的长度一般为40到2048 bits），然后使用 key-scheduling algorithm (KSA) 对 S 做转换，
Pseudo-Random Generation Algorithm (PRGA): 最后在使用伪随机数生成算法 (PRGA) 来生成 keystream.

1.1. Key-Scheduling Algorithm (KSA)

S: The key-scheduling algorithm is used initialize the permutation in the array S.
len(key): the number of bytes in the key, 长度范围可以为 [1, 256]，不过一般是 [5, 256].

算法可以表示如下：

var S [256]uint32

for i := 0; i < 256; i++ {
  S[i] = uint32(i)
}

var j uint8 = 0
for i := 0; i < 256; i++{
  j += S[i] + key[i%len(key)]
  S[i], S[j] = S[j], S[i]
}

1.2. Pseudo-Random Generation Algorithm (PRGA)

src: the plaintext or the ciphertext
keystream: the keystream

算法可以表示如下：

var i uint8 = 0
var j uint8 = 0
var keystream []byte
for k, v := range src {
  i += 1
  j += uint8(S[i])
  S[i], S[j] = S[j], S[i]
  keysream[k] = S[ uint8(S[i] + S[j]) ]
}

😁，有没有发现，其实上面的keystream可以作为随机数，因此也可以把基于 RC4 来实现 PRNG

2. RC4 cipher 的实现

go 语言中的 crypto/rc4 实现了rc4 cipher，代码如下：

import "strconv"

// A Cipher is an instance of RC4 using a particular key.
type Cipher struct {
  s [256]uint32
  i, j uint8
}

type KeySizeError int

func (k KeySizeError) Error() string {
  return "crypto/rc4: invalid key" + strconv.Itoa(int(k))
}

// NewCipher creates and returns a new Cipher. The argument should be the RC4 key,
// at least 1 byte and at most 256 bytes.
func NewCipher(key []byte) (*Cipher, error){
  k := len(key)
  if k<1 || k>256 {
    return nil, KeySizeError(k)
  }

  // Key-scheduling algorithm (KSA)
  // 1. initialize s as identity permutation
  var c Cipher
  for i:=0; i<256; i++ {
    c.s[i] = uint32(i)
  }
  // 2. mixes s with the bytes of the key
  var j uint8 = 0
  for i:=0; i<256; i++ {
    j += uint8(c.s[i]) + key[i%k]
    c.s[i], c.s[j] = c.s[j], c.s[i]
  }
  return &c, nil
}

// Reset zeros the key data so that it will no longer appear in the process's memory.
func (c *Cipher) Reset() {
  for i := range c.s {
    c.s[i] = 0
  }
  c.i, c.j = 0, 0
}

//xorKeyStreamGeneric set dst to the result of XORing src with the key stream.
// Dst and src may be the same slice but otherwise should not overlap
//
// This is the pure Go version. rc4_{amd64, 386, arm}* contain assembly implementations.
// This is here for tests and to prevent bitrot.
func (c *Cipher) xorKeyStreamGeneric(dst, src []byte) {
  i, j := c.i, c.j
  for k, v := range src {
    i += 1
    j += uint8(c.s[i])
    c.s[i], c.s[j] = c.s[j], c.s[i]
    dst[k] = v ^ uint8(c.s[uint8(c.s[i]+c.s[j])])
  }
  c.i, c.j = i, j
}

与汇编语言的接口如下：

// +build amd64 amd64p32 arm,!nacl 386

func xorKeyStream(dst, src *byte, n int, state *[256]uint32, i, j *uint8)

// XORKeyStream sets dst to the result of XORing src with the key stream.
// Dst and src may be the same slice but otherwise should not overlap.
func (c *Cipher) XORKeyStream(dst, src []byte) {
  if len(src) == 0 {
    return 
  }
  xorKeySteream(&dst[0], &src[0], len(src), &c.s, &c.i, &c.j)
}

对应 AMD64 处理器上的汇编语言的优化版实现如下：

// RC4 implementation optimized for AMD64 processors.
TEXT .xorKeyStream(SB),NOSPLIT, $0
  MOVQ n+16(FP), BX   // rbx = ARG(len)
  MOVQ src+8(FP), SI  // in = ARG(in)
  MOVQ dst+0(FP), DI  // out = ARG(out)
  MOVQ state+24(FP), BP
  MOVQ i+32(FP), AX
  MOVBQZX 0(AX), CX   // x = *xp
  MOVQ j+40(FP), AX
  MOVBQZX 0(AX), DX   // y = *yp

  LEAQ (SI)(BX*1), R9 //limit = in + len

l1:    CMPQ    SI,        R9        // cmp in with in+len
    JGE    finished              // jump if (in >= in+len)

    INCB    CX
    EXTEND(CX)
    TESTL    $15,        CX
    JZ    wordloop

    MOVBLZX    (BP)(CX*4),    AX

    ADDB    AX,        DX          // y += tx
    EXTEND(DX)
    MOVBLZX    (BP)(DX*4),    BX        // ty = d[y]
    MOVB    BX,        (BP)(CX*4)      // d[x] = ty
    ADDB    AX,        BX                // val = ty+tx
    EXTEND(BX)
    MOVB    AX,        (BP)(DX*4)      // d[y] = tx
    MOVBLZX    (BP)(BX*4),    R8        // val = d[val]
    XORB    (SI),        R8              // xor 1 byte
    MOVB    R8,        (DI)
    INCQ    SI                      // in++
    INCQ    DI                      // out++
    JMP l1

wordloop:
    SUBQ    $16,        R9
    CMPQ    SI,        R9
    JGT    end

start:
    ADDQ    $16,        SI        // increment in
    ADDQ    $16,        DI        // increment out


    // Each KEYROUND generates one byte of key and
    // inserts it into an XMM register at the given 16-bit index.
    // The key state array is uint32 words only using the bottom
    // byte of each word, so the 16-bit OR only copies 8 useful bits.
    // We accumulate alternating bytes into X0 and X1, and then at
    // the end we OR X1<<8 into X0 to produce the actual key.
    //
    // At the beginning of the loop, CX%16 == 0, so the 16 loads
    // at state[CX], state[CX+1], ..., state[CX+15] can precompute
    // (state+CX) as R12 and then become R12[0], R12[1], ... R12[15],
    // without fear of the byte computation CX+15 wrapping around.
    //
    // The first round needs R12[0], the second needs R12[1], and so on.
    // We can avoid memory stalls by starting the load for round n+1
    // before the end of round n, using the LOAD macro.
    LEAQ    (BP)(CX*4),    R12

#define KEYROUND(xmm, load, off, r1, r2, index) \
    MOVBLZX    (BP)(DX*4),    R8; \
    MOVB    r1,        (BP)(DX*4); \
    load((off+1), r2); \
    MOVB    R8,        (off*4)(R12); \
    ADDB    r1,        R8; \
    EXTEND(R8); \
    PINSRW    $index, (BP)(R8*4), xmm

#define LOAD(off, reg) \
    MOVBLZX    (off*4)(R12),    reg; \
    ADDB    reg,        DX; \
    EXTEND(DX)

#define SKIP(off, reg)

    LOAD(0, AX)
    KEYROUND(X0, LOAD, 0, AX, BX, 0)
    KEYROUND(X1, LOAD, 1, BX, AX, 0)
    KEYROUND(X0, LOAD, 2, AX, BX, 1)
    KEYROUND(X1, LOAD, 3, BX, AX, 1)
    KEYROUND(X0, LOAD, 4, AX, BX, 2)
    KEYROUND(X1, LOAD, 5, BX, AX, 2)
    KEYROUND(X0, LOAD, 6, AX, BX, 3)
    KEYROUND(X1, LOAD, 7, BX, AX, 3)
    KEYROUND(X0, LOAD, 8, AX, BX, 4)
    KEYROUND(X1, LOAD, 9, BX, AX, 4)
    KEYROUND(X0, LOAD, 10, AX, BX, 5)
    KEYROUND(X1, LOAD, 11, BX, AX, 5)
    KEYROUND(X0, LOAD, 12, AX, BX, 6)
    KEYROUND(X1, LOAD, 13, BX, AX, 6)
    KEYROUND(X0, LOAD, 14, AX, BX, 7)
    KEYROUND(X1, SKIP, 15, BX, AX, 7)

    ADDB    $16,        CX

    PSLLQ    $8,        X1
    PXOR    X1,        X0
    MOVOU    -16(SI),    X2
    PXOR    X0,        X2
    MOVOU    X2,        -16(DI)

    CMPQ    SI,        R9          // cmp in with in+len-16
    JLE    start                    // jump if (in <= in+len-16)

end:
    DECB    CX
    ADDQ    $16,        R9        // tmp = in+len

    // handle the last bytes, one by one
l2:    CMPQ    SI,        R9        // cmp in with in+len
    JGE    finished              // jump if (in >= in+len)

    INCB    CX
    EXTEND(CX)
    MOVBLZX    (BP)(CX*4),    AX

    ADDB    AX,        DX                // y += tx
    EXTEND(DX)
    MOVBLZX    (BP)(DX*4),    BX        // ty = d[y]
    MOVB    BX,        (BP)(CX*4)      // d[x] = ty
    ADDB    AX,        BX                // val = ty+tx
    EXTEND(BX)
    MOVB    AX,        (BP)(DX*4)      // d[y] = tx
    MOVBLZX    (BP)(BX*4),    R8        // val = d[val]
    XORB    (SI),        R8              // xor 1 byte
    MOVB    R8,        (DI)
    INCQ    SI                          // in++
    INCQ    DI                          // out++
    JMP l2

finished:
    MOVQ    j+40(FP),    BX
    MOVB    DX, 0(BX)
    MOVQ    i+32(FP),    AX
    MOVB    CX, 0(AX)
    RET

2.1. RC4 cipher 的使用

首先调用rc4包中的func NewCipher(key []byte) (*Cipher, error)新生成一个Cipher, 然后调用func (c *Cipher) XORKeyStream(dst, src []byte)进行加密或者解密

1.4.2 Revest Cipher 4 Algorithm