Diffstat (limited to 'tools/testing')
43 files changed, 5632 insertions, 541 deletions
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 33339e31e365..7f4527f897c4 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -3,6 +3,7 @@ CFLAGS = -Wall CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE +TEST_PROGS := binfmt_script TEST_GEN_PROGS := execveat TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test diff --git a/tools/testing/selftests/exec/binfmt_script b/tools/testing/selftests/exec/binfmt_script new file mode 100755 index 000000000000..05f94a741c7a --- /dev/null +++ b/tools/testing/selftests/exec/binfmt_script @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +# +# Test that truncation of bprm->buf doesn't cause unexpected execs paths, along +# with various other pathological cases. +import os, subprocess + +# Relevant commits +# +# b5372fe5dc84 ("exec: load_script: Do not exec truncated interpreter path") +# 6eb3c3d0a52d ("exec: increase BINPRM_BUF_SIZE to 256") + +# BINPRM_BUF_SIZE +SIZE=256 + +NAME_MAX=int(subprocess.check_output(["getconf", "NAME_MAX", "."])) + +test_num=0 + +code='''#!/usr/bin/perl +print "Executed interpreter! Args:\n"; +print "0 : '$0'\n"; +$counter = 1; +foreach my $a (@ARGV) { + print "$counter : '$a'\n"; + $counter++; +} +''' + +## +# test - produce a binfmt_script hashbang line for testing +# +# @size: bytes for bprm->buf line, including hashbang but not newline +# @good: whether this script is expected to execute correctly +# @hashbang: the special 2 bytes for running binfmt_script +# @leading: any leading whitespace before the executable path +# @root: start of executable pathname +# @target: end of executable pathname +# @arg: bytes following the executable pathname +# @fill: character to fill between @root and @target to reach @size bytes +# @newline: character to use as newline, not counted towards @size +# ... +def test(name, size, good=True, leading="", root="./", target="/perl", + fill="A", arg="", newline="\n", hashbang="#!"): + global test_num, tests, NAME_MAX + test_num += 1 + if test_num > tests: + raise ValueError("more binfmt_script tests than expected! 
(want %d, expected %d)" + % (test_num, tests)) + + middle = "" + remaining = size - len(hashbang) - len(leading) - len(root) - len(target) - len(arg) + # The middle of the pathname must not exceed NAME_MAX + while remaining >= NAME_MAX: + middle += fill * (NAME_MAX - 1) + middle += '/' + remaining -= NAME_MAX + middle += fill * remaining + + dirpath = root + middle + binary = dirpath + target + if len(target): + os.makedirs(dirpath, mode=0o755, exist_ok=True) + open(binary, "w").write(code) + os.chmod(binary, 0o755) + + buf=hashbang + leading + root + middle + target + arg + newline + if len(newline) > 0: + buf += 'echo this is not really perl\n' + + script = "binfmt_script-%s" % (name) + open(script, "w").write(buf) + os.chmod(script, 0o755) + + proc = subprocess.Popen(["./%s" % (script)], shell=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout = proc.communicate()[0] + + if proc.returncode == 0 and b'Executed interpreter' in stdout: + if good: + print("ok %d - binfmt_script %s (successful good exec)" + % (test_num, name)) + else: + print("not ok %d - binfmt_script %s succeeded when it should have failed" + % (test_num, name)) + else: + if good: + print("not ok %d - binfmt_script %s failed when it should have succeeded (rc:%d)" + % (test_num, name, proc.returncode)) + else: + print("ok %d - binfmt_script %s (correctly failed bad exec)" + % (test_num, name)) + + # Clean up crazy binaries + os.unlink(script) + if len(target): + elements = binary.split('/') + os.unlink(binary) + elements.pop() + while len(elements) > 1: + os.rmdir("/".join(elements)) + elements.pop() + +tests=27 +print("TAP version 1.3") +print("1..%d" % (tests)) + +### FAIL (8 tests) + +# Entire path is well past the BINFMT_BUF_SIZE. +test(name="too-big", size=SIZE+80, good=False) +# Path is right at max size, making it impossible to tell if it was truncated. +test(name="exact", size=SIZE, good=False) +# Same as above, but with leading whitespace. +test(name="exact-space", size=SIZE, good=False, leading=" ") +# Huge buffer of only whitespace. +test(name="whitespace-too-big", size=SIZE+71, good=False, root="", + fill=" ", target="") +# A good path, but it gets truncated due to leading whitespace. +test(name="truncated", size=SIZE+17, good=False, leading=" " * 19) +# Entirely empty except for #! +test(name="empty", size=2, good=False, root="", + fill="", target="", newline="") +# Within size, but entirely spaces +test(name="spaces", size=SIZE-1, good=False, root="", fill=" ", + target="", newline="") +# Newline before binary. +test(name="newline-prefix", size=SIZE-1, good=False, leading="\n", + root="", fill=" ", target="") + +### ok (19 tests) + +# The original test case that was broken by commit: +# 8099b047ecc4 ("exec: load_script: don't blindly truncate shebang string") +test(name="test.pl", size=439, leading=" ", + root="./nix/store/bwav8kz8b3y471wjsybgzw84mrh4js9-perl-5.28.1/bin", + arg=" -I/nix/store/x6yyav38jgr924nkna62q3pkp0dgmzlx-perl5.28.1-File-Slurp-9999.25/lib/perl5/site_perl -I/nix/store/ha8v67sl8dac92r9z07vzr4gv1y9nwqz-perl5.28.1-Net-DBus-1.1.0/lib/perl5/site_perl -I/nix/store/dcrkvnjmwh69ljsvpbdjjdnqgwx90a9d-perl5.28.1-XML-Parser-2.44/lib/perl5/site_perl -I/nix/store/rmji88k2zz7h4zg97385bygcydrf2q8h-perl5.28.1-XML-Twig-3.52/lib/perl5/site_perl") +# One byte under size, leaving newline visible. +test(name="one-under", size=SIZE-1) +# Two bytes under size, leaving newline visible. 
+test(name="two-under", size=SIZE-2) +# Exact size, but trailing whitespace visible instead of newline +test(name="exact-trunc-whitespace", size=SIZE, arg=" ") +# Exact size, but trailing space and first arg char visible instead of newline. +test(name="exact-trunc-arg", size=SIZE, arg=" f") +# One bute under, with confirmed non-truncated arg since newline now visible. +test(name="one-under-full-arg", size=SIZE-1, arg=" f") +# Short read buffer by one byte. +test(name="one-under-no-nl", size=SIZE-1, newline="") +# Short read buffer by half buffer size. +test(name="half-under-no-nl", size=int(SIZE/2), newline="") +# One byte under with whitespace arg. leaving wenline visible. +test(name="one-under-trunc-arg", size=SIZE-1, arg=" ") +# One byte under with whitespace leading. leaving wenline visible. +test(name="one-under-leading", size=SIZE-1, leading=" ") +# One byte under with whitespace leading and as arg. leaving newline visible. +test(name="one-under-leading-trunc-arg", size=SIZE-1, leading=" ", arg=" ") +# Same as above, but with 2 bytes under +test(name="two-under-no-nl", size=SIZE-2, newline="") +test(name="two-under-trunc-arg", size=SIZE-2, arg=" ") +test(name="two-under-leading", size=SIZE-2, leading=" ") +test(name="two-under-leading-trunc-arg", size=SIZE-2, leading=" ", arg=" ") +# Same as above, but with buffer half filled +test(name="two-under-no-nl", size=int(SIZE/2), newline="") +test(name="two-under-trunc-arg", size=int(SIZE/2), arg=" ") +test(name="two-under-leading", size=int(SIZE/2), leading=" ") +test(name="two-under-lead-trunc-arg", size=int(SIZE/2), leading=" ", arg=" ") + +if test_num != tests: + raise ValueError("fewer binfmt_script tests than expected! (ran %d, expected %d" + % (test_num, tests)) diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index 14a77ea4a8da..b80ee3f6e265 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -2,3 +2,4 @@ CONFIG_TEST_PRINTF=m CONFIG_TEST_BITMAP=m CONFIG_PRIME_NUMBERS=m CONFIG_TEST_STRSCPY=m +CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c index 6dee9e636a95..422e7761254d 100644 --- a/tools/testing/selftests/net/rxtimestamp.c +++ b/tools/testing/selftests/net/rxtimestamp.c @@ -115,6 +115,7 @@ static struct option long_options[] = { { "tcp", no_argument, 0, 't' }, { "udp", no_argument, 0, 'u' }, { "ip", no_argument, 0, 'i' }, + { NULL, 0, NULL, 0 }, }; static int next_port = 19999; diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c index aca3491174a1..f4bb4fef0f39 100644 --- a/tools/testing/selftests/net/timestamping.c +++ b/tools/testing/selftests/net/timestamping.c @@ -313,10 +313,16 @@ int main(int argc, char **argv) int val; socklen_t len; struct timeval next; + size_t if_len; if (argc < 2) usage(0); interface = argv[1]; + if_len = strlen(interface); + if (if_len >= IFNAMSIZ) { + printf("interface name exceeds IFNAMSIZ\n"); + exit(1); + } for (i = 2; i < argc; i++) { if (!strcasecmp(argv[i], "SO_TIMESTAMP")) @@ -350,12 +356,12 @@ int main(int argc, char **argv) bail("socket"); memset(&device, 0, sizeof(device)); - strncpy(device.ifr_name, interface, sizeof(device.ifr_name)); + memcpy(device.ifr_name, interface, if_len + 1); if (ioctl(sock, SIOCGIFADDR, &device) < 0) bail("getting interface IP address"); memset(&hwtstamp, 0, sizeof(hwtstamp)); - strncpy(hwtstamp.ifr_name, interface, sizeof(hwtstamp.ifr_name)); + memcpy(hwtstamp.ifr_name, 
interface, if_len + 1); hwtstamp.ifr_data = (void *)&hwconfig; memset(&hwconfig, 0, sizeof(hwconfig)); hwconfig.tx_type = diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index c5282e62df75..b599f1fa99b5 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -213,6 +213,64 @@ TEST_F(tls, send_then_sendfile) EXPECT_EQ(recv(self->cfd, buf, st.st_size, MSG_WAITALL), st.st_size); } +static void chunked_sendfile(struct __test_metadata *_metadata, + struct _test_data_tls *self, + uint16_t chunk_size, + uint16_t extra_payload_size) +{ + char buf[TLS_PAYLOAD_MAX_LEN]; + uint16_t test_payload_size; + int size = 0; + int ret; + char filename[] = "/tmp/mytemp.XXXXXX"; + int fd = mkstemp(filename); + off_t offset = 0; + + unlink(filename); + ASSERT_GE(fd, 0); + EXPECT_GE(chunk_size, 1); + test_payload_size = chunk_size + extra_payload_size; + ASSERT_GE(TLS_PAYLOAD_MAX_LEN, test_payload_size); + memset(buf, 1, test_payload_size); + size = write(fd, buf, test_payload_size); + EXPECT_EQ(size, test_payload_size); + fsync(fd); + + while (size > 0) { + ret = sendfile(self->fd, fd, &offset, chunk_size); + EXPECT_GE(ret, 0); + size -= ret; + } + + EXPECT_EQ(recv(self->cfd, buf, test_payload_size, MSG_WAITALL), + test_payload_size); + + close(fd); +} + +TEST_F(tls, multi_chunk_sendfile) +{ + chunked_sendfile(_metadata, self, 4096, 4096); + chunked_sendfile(_metadata, self, 4096, 0); + chunked_sendfile(_metadata, self, 4096, 1); + chunked_sendfile(_metadata, self, 4096, 2048); + chunked_sendfile(_metadata, self, 8192, 2048); + chunked_sendfile(_metadata, self, 4096, 8192); + chunked_sendfile(_metadata, self, 8192, 4096); + chunked_sendfile(_metadata, self, 12288, 1024); + chunked_sendfile(_metadata, self, 12288, 2000); + chunked_sendfile(_metadata, self, 15360, 100); + chunked_sendfile(_metadata, self, 15360, 300); + chunked_sendfile(_metadata, self, 1, 4096); + chunked_sendfile(_metadata, self, 2048, 4096); + chunked_sendfile(_metadata, self, 2048, 8192); + chunked_sendfile(_metadata, self, 4096, 8192); + chunked_sendfile(_metadata, self, 1024, 12288); + chunked_sendfile(_metadata, self, 2000, 12288); + chunked_sendfile(_metadata, self, 100, 15360); + chunked_sendfile(_metadata, self, 300, 15360); +} + TEST_F(tls, recv_max) { unsigned int send_len = TLS_PAYLOAD_MAX_LEN; diff --git a/tools/testing/selftests/ntb/ntb_test.sh b/tools/testing/selftests/ntb/ntb_test.sh index 9c60337317c6..020137b61407 100755 --- a/tools/testing/selftests/ntb/ntb_test.sh +++ b/tools/testing/selftests/ntb/ntb_test.sh @@ -241,7 +241,7 @@ function get_files_count() split_remote $LOC if [[ "$REMOTE" == "" ]]; then - echo $(ls -1 "$LOC"/${NAME}* 2>/dev/null | wc -l) + echo $(ls -1 "$VPATH"/${NAME}* 2>/dev/null | wc -l) else echo $(ssh "$REMOTE" "ls -1 \"$VPATH\"/${NAME}* | \ wc -l" 2> /dev/null) diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 644770c3b754..0830e63818c1 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -19,6 +19,7 @@ SUB_DIRS = alignment \ copyloops \ dscr \ mm \ + nx-gzip \ pmu \ signal \ primitives \ diff --git a/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules new file mode 100644 index 000000000000..5a7118495cb3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules @@ -0,0 +1 @@ +SUBSYSTEM=="nxgzip", KERNEL=="nx-gzip", MODE="0666" diff --git 
a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile new file mode 100644 index 000000000000..640fad6cc2c7 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -0,0 +1,8 @@ +CFLAGS = -O3 -m64 -I./include + +TEST_GEN_FILES := gzfht_test gunz_test +TEST_PROGS := nx-gzip-test.sh + +include ../../lib.mk + +$(TEST_GEN_FILES): gzip_vas.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/README b/tools/testing/selftests/powerpc/nx-gzip/README new file mode 100644 index 000000000000..9809dbaa1905 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/README @@ -0,0 +1,45 @@ +Test the nx-gzip function: +========================= + +Verify that following device exists: + /dev/crypto/nx-gzip +If you get a permission error run as sudo or set the device permissions: + sudo chmod go+rw /dev/crypto/nx-gzip +However, chmod may not survive across boots. You may create a udev file such +as: + /etc/udev/rules.d/99-nx-gzip.rules + + +To manually build and run: +$ gcc -O3 -I./include -o gzfht_test gzfht_test.c gzip_vas.c +$ gcc -O3 -I./include -o gunz_test gunz_test.c gzip_vas.c + + +Compress any file using Fixed Huffman mode. Output will have a .nx.gz suffix: +$ ./gzfht_test gzip_vas.c +file gzip_vas.c read, 6413 bytes +compressed 6413 to 3124 bytes total, crc32 checksum = abd15e8a + + +Uncompress the previous output. Output will have a .nx.gunzip suffix: +./gunz_test gzip_vas.c.nx.gz +gzHeader FLG 0 +00 00 00 00 04 03 +gzHeader MTIME, XFL, OS ignored +computed checksum abd15e8a isize 0000190d +stored checksum abd15e8a isize 0000190d +decomp is complete: fclose + + +Compare two files: +$ sha1sum gzip_vas.c.nx.gz.nx.gunzip gzip_vas.c +bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c.nx.gz.nx.gunzip +bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c + + +Note that the code here are intended for testing the nx-gzip hardware function. +They are not intended for demonstrating performance or compression ratio. +By being simplistic these selftests expect to allocate the entire set of source +and target pages in the memory so it needs enough memory to work. +For more information and source code consider using: +https://github.com/libnxz/power-gzip diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c new file mode 100644 index 000000000000..6ee0fded0391 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c @@ -0,0 +1,1028 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* P9 gunzip sample code for demonstrating the P9 NX hardware + * interface. Not intended for productive uses or for performance or + * compression ratio measurements. Note also that /dev/crypto/gzip, + * VAS and skiboot support are required + * + * Copyright 2020 IBM Corp. + * + * Author: Bulent Abali <abali@us.ibm.com> + * + * https://github.com/libnxz/power-gzip for zlib api and other utils + * Definitions of acronyms used here. 
See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +#define _ISOC11_SOURCE // For aligned_alloc() +#define _DEFAULT_SOURCE // For endian.h + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <sys/mman.h> +#include <endian.h> +#include <bits/endian.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include "nxu.h" +#include "nx.h" +#include "crb.h" + +int nx_dbg; +FILE *nx_gzip_log; + +#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y)) +#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y)) + +#define GETINPC(X) fgetc(X) +#define FNAME_MAX 1024 + +/* fifo queue management */ +#define fifo_used_bytes(used) (used) +#define fifo_free_bytes(used, len) ((len)-(used)) +/* amount of free bytes in the first and last parts */ +#define fifo_free_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? (len)-((cur)+(used)) : 0) +#define fifo_free_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? (cur) : (len)-(used)) +/* amount of used bytes in the first and last parts */ +#define fifo_used_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? (used) : (len)-(cur)) +#define fifo_used_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \ + ? 0 : ((used)+(cur))-(len)) +/* first and last free parts start here */ +#define fifo_free_first_offset(cur, used) ((cur)+(used)) +#define fifo_free_last_offset(cur, used, len) \ + fifo_used_last_bytes(cur, used, len) +/* first and last used parts start here */ +#define fifo_used_first_offset(cur) (cur) +#define fifo_used_last_offset(cur) (0) + +const int fifo_in_len = 1<<24; +const int fifo_out_len = 1<<24; +const int page_sz = 1<<16; +const int line_sz = 1<<7; +const int window_max = 1<<15; + +/* + * Adds an (address, len) pair to the list of ddes (ddl) and updates + * the base dde. ddl[0] is the only dde in a direct dde which + * contains a single (addr,len) pair. For more pairs, ddl[0] becomes + * the indirect (base) dde that points to a list of direct ddes. + * See Section 6.4 of the NX-gzip user manual for DDE description. + * Addr=NULL, len=0 clears the ddl[0]. Returns the total number of + * bytes in ddl. Caller is responsible for allocting the array of + * nx_dde_t *ddl. If N addresses are required in the scatter-gather + * list, the ddl array must have N+1 entries minimum. 
+ */ +static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr, + uint32_t len) +{ + uint32_t ddecnt; + uint32_t bytes; + + if (addr == NULL && len == 0) { + clearp_dde(ddl); + return 0; + } + + NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, addr, + __func__, len)); + + /* Number of ddes in the dde list ; == 0 when it is a direct dde */ + ddecnt = getpnn(ddl, dde_count); + bytes = getp32(ddl, ddebc); + + if (ddecnt == 0 && bytes == 0) { + /* First dde is unused; make it a direct dde */ + bytes = len; + putp32(ddl, ddebc, bytes); + putp64(ddl, ddead, (uint64_t) addr); + } else if (ddecnt == 0) { + /* Converting direct to indirect dde + * ddl[0] becomes head dde of ddl + * copy direct to indirect first. + */ + ddl[1] = ddl[0]; + + /* Add the new dde next */ + clear_dde(ddl[2]); + put32(ddl[2], ddebc, len); + put64(ddl[2], ddead, (uint64_t) addr); + + /* Ddl head points to 2 direct ddes */ + ddecnt = 2; + putpnn(ddl, dde_count, ddecnt); + bytes = bytes + len; + putp32(ddl, ddebc, bytes); + /* Pointer to the first direct dde */ + putp64(ddl, ddead, (uint64_t) &ddl[1]); + } else { + /* Append a dde to an existing indirect ddl */ + ++ddecnt; + clear_dde(ddl[ddecnt]); + put64(ddl[ddecnt], ddead, (uint64_t) addr); + put32(ddl[ddecnt], ddebc, len); + + putpnn(ddl, dde_count, ddecnt); + bytes = bytes + len; + putp32(ddl, ddebc, bytes); /* byte sum of all dde */ + } + return bytes; +} + +/* + * Touch specified number of pages represented in number bytes + * beginning from the first buffer in a dde list. + * Do not touch the pages past buf_sz-th byte's page. + * + * Set buf_sz = 0 to touch all pages described by the ddep. + */ +static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz, + int wr) +{ + uint32_t indirect_count; + uint32_t buf_len; + long total; + uint64_t buf_addr; + struct nx_dde_t *dde_list; + int i; + + assert(!!ddep); + + indirect_count = getpnn(ddep, dde_count); + + NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__, + indirect_count)); + NXPRT(fprintf(stderr, "0x%lx\n", buf_sz)); + + if (indirect_count == 0) { + /* Direct dde */ + buf_len = getp32(ddep, ddebc); + buf_addr = getp64(ddep, ddead); + + NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n", + buf_len, (void *)buf_addr)); + + if (buf_sz == 0) + nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); + else + nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len, + buf_sz), page_sz, wr); + + return ERR_NX_OK; + } + + /* Indirect dde */ + if (indirect_count > MAX_DDE_COUNT) + return ERR_NX_EXCESSIVE_DDE; + + /* First address of the list */ + dde_list = (struct nx_dde_t *) getp64(ddep, ddead); + + if (buf_sz == 0) + buf_sz = getp32(ddep, ddebc); + + total = 0; + for (i = 0; i < indirect_count; i++) { + buf_len = get32(dde_list[i], ddebc); + buf_addr = get64(dde_list[i], ddead); + total += buf_len; + + NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ", + buf_len, (void *)buf_addr)); + NXPRT(fprintf(stderr, "0x%lx\n", total)); + + /* Touching fewer pages than encoded in the ddebc */ + if (total > buf_sz) { + buf_len = NX_MIN(buf_len, total - buf_sz); + nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); + NXPRT(fprintf(stderr, "touch loop break len 0x%x ", + buf_len)); + NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr)); + break; + } + nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr); + } + return ERR_NX_OK; +} + +/* + * Src and dst buffers are supplied in scatter gather lists. + * NX function code and other parameters supplied in cmdp. 
+ */ +static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst, + struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + uint64_t csbaddr; + + memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb)); + + cmdp->crb.source_dde = *src; + cmdp->crb.target_dde = *dst; + + /* Status, output byte count in tpbc */ + csbaddr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask; + put64(cmdp->crb, csb_address, csbaddr); + + /* NX reports input bytes in spbc; cleared */ + cmdp->cpb.out_spbc_comp_wrap = 0; + cmdp->cpb.out_spbc_comp_with_count = 0; + cmdp->cpb.out_spbc_decomp = 0; + + /* Clear output */ + put32(cmdp->cpb, out_crc, INIT_CRC); + put32(cmdp->cpb, out_adler, INIT_ADLER); + + /* Submit the crb, the job descriptor, to the accelerator. */ + return nxu_submit_job(cmdp, handle); +} + +int decompress_file(int argc, char **argv, void *devhandle) +{ + FILE *inpf = NULL; + FILE *outf = NULL; + + int c, expect, i, cc, rc = 0; + char gzfname[FNAME_MAX]; + + /* Queuing, file ops, byte counting */ + char *fifo_in, *fifo_out; + int used_in, cur_in, used_out, cur_out, read_sz, n; + int first_free, last_free, first_used, last_used; + int first_offset, last_offset; + int write_sz, free_space, source_sz; + int source_sz_estimate, target_sz_estimate; + uint64_t last_comp_ratio = 0; /* 1000 max */ + uint64_t total_out = 0; + int is_final, is_eof; + + /* nx hardware */ + int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0; + int history_len = 0; + struct nx_gzip_crb_cpb_t cmd, *cmdp; + struct nx_dde_t *ddl_in; + struct nx_dde_t dde_in[6] __aligned(128); + struct nx_dde_t *ddl_out; + struct nx_dde_t dde_out[6] __aligned(128); + int pgfault_retries; + + /* when using mmap'ed files */ + off_t input_file_offset; + + if (argc > 2) { + fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]); + fprintf(stderr, " writes to stdout or <fname>.nx.gunzip\n"); + return -1; + } + + if (argc == 1) { + inpf = stdin; + outf = stdout; + } else if (argc == 2) { + char w[1024]; + char *wp; + + inpf = fopen(argv[1], "r"); + if (inpf == NULL) { + perror(argv[1]); + return -1; + } + + /* Make a new file name to write to. Ignoring '.gz' */ + wp = (NULL != (wp = strrchr(argv[1], '/'))) ? (wp+1) : argv[1]; + strcpy(w, wp); + strcat(w, ".nx.gunzip"); + + outf = fopen(w, "w"); + if (outf == NULL) { + perror(w); + return -1; + } + } + + /* Decode the gzip header */ + c = GETINPC(inpf); expect = 0x1f; /* ID1 */ + if (c != expect) + goto err1; + + c = GETINPC(inpf); expect = 0x8b; /* ID2 */ + if (c != expect) + goto err1; + + c = GETINPC(inpf); expect = 0x08; /* CM */ + if (c != expect) + goto err1; + + int flg = GETINPC(inpf); /* FLG */ + + if (flg & 0xE0 || flg & 0x4 || flg == EOF) + goto err2; + + fprintf(stderr, "gzHeader FLG %x\n", flg); + + /* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this + * sample code. 
+ */ + for (i = 0; i < 6; i++) { + char tmp[10]; + + tmp[i] = GETINPC(inpf); + if (tmp[i] == EOF) + goto err3; + fprintf(stderr, "%02x ", tmp[i]); + if (i == 5) + fprintf(stderr, "\n"); + } + fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n"); + + /* FNAME */ + if (flg & 0x8) { + int k = 0; + + do { + c = GETINPC(inpf); + if (c == EOF || k >= FNAME_MAX) + goto err3; + gzfname[k++] = c; + } while (c); + fprintf(stderr, "gzHeader FNAME: %s\n", gzfname); + } + + /* FHCRC */ + if (flg & 0x2) { + c = GETINPC(inpf); + if (c == EOF) + goto err3; + c = GETINPC(inpf); + if (c == EOF) + goto err3; + fprintf(stderr, "gzHeader FHCRC: ignored\n"); + } + + used_in = cur_in = used_out = cur_out = 0; + is_final = is_eof = 0; + + /* Allocate one page larger to prevent page faults due to NX + * overfetching. + * Either do this (char*)(uintptr_t)aligned_alloc or use + * -std=c11 flag to make the int-to-pointer warning go away. + */ + assert((fifo_in = (char *)(uintptr_t)aligned_alloc(line_sz, + fifo_in_len + page_sz)) != NULL); + assert((fifo_out = (char *)(uintptr_t)aligned_alloc(line_sz, + fifo_out_len + page_sz + line_sz)) != NULL); + /* Leave unused space due to history rounding rules */ + fifo_out = fifo_out + line_sz; + nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1); + + ddl_in = &dde_in[0]; + ddl_out = &dde_out[0]; + cmdp = &cmd; + memset(&cmdp->crb, 0, sizeof(cmdp->crb)); + +read_state: + + /* Read from .gz file */ + + NXPRT(fprintf(stderr, "read_state:\n")); + + if (is_eof != 0) + goto write_state; + + /* We read in to fifo_in in two steps: first: read in to from + * cur_in to the end of the buffer. last: if free space wrapped + * around, read from fifo_in offset 0 to offset cur_in. + */ + + /* Reset fifo head to reduce unnecessary wrap arounds */ + cur_in = (used_in == 0) ? 0 : cur_in; + + /* Free space total is reduced by a gap */ + free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len) + - line_sz); + + /* Free space may wrap around as first and last */ + first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len); + last_free = fifo_free_last_bytes(cur_in, used_in, fifo_in_len); + + /* Start offsets of the free memory */ + first_offset = fifo_free_first_offset(cur_in, used_in); + last_offset = fifo_free_last_offset(cur_in, used_in, fifo_in_len); + + /* Reduce read_sz because of the line_sz gap */ + read_sz = NX_MIN(free_space, first_free); + n = 0; + if (read_sz > 0) { + /* Read in to offset cur_in + used_in */ + n = fread(fifo_in + first_offset, 1, read_sz, inpf); + used_in = used_in + n; + free_space = free_space - n; + assert(n <= read_sz); + if (n != read_sz) { + /* Either EOF or error; exit the read loop */ + is_eof = 1; + goto write_state; + } + } + + /* If free space wrapped around */ + if (last_free > 0) { + /* Reduce read_sz because of the line_sz gap */ + read_sz = NX_MIN(free_space, last_free); + n = 0; + if (read_sz > 0) { + n = fread(fifo_in + last_offset, 1, read_sz, inpf); + used_in = used_in + n; /* Increase used space */ + free_space = free_space - n; /* Decrease free space */ + assert(n <= read_sz); + if (n != read_sz) { + /* Either EOF or error; exit the read loop */ + is_eof = 1; + goto write_state; + } + } + } + + /* At this point we have used_in bytes in fifo_in with the + * data head starting at cur_in and possibly wrapping around. 
+ */ + +write_state: + + /* Write decompressed data to output file */ + + NXPRT(fprintf(stderr, "write_state:\n")); + + if (used_out == 0) + goto decomp_state; + + /* If fifo_out has data waiting, write it out to the file to + * make free target space for the accelerator used bytes in + * the first and last parts of fifo_out. + */ + + first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len); + last_used = fifo_used_last_bytes(cur_out, used_out, fifo_out_len); + + write_sz = first_used; + + n = 0; + if (write_sz > 0) { + n = fwrite(fifo_out + cur_out, 1, write_sz, outf); + used_out = used_out - n; + /* Move head of the fifo */ + cur_out = (cur_out + n) % fifo_out_len; + assert(n <= write_sz); + if (n != write_sz) { + fprintf(stderr, "error: write\n"); + rc = -1; + goto err5; + } + } + + if (last_used > 0) { /* If more data available in the last part */ + write_sz = last_used; /* Keep it here for later */ + n = 0; + if (write_sz > 0) { + n = fwrite(fifo_out, 1, write_sz, outf); + used_out = used_out - n; + cur_out = (cur_out + n) % fifo_out_len; + assert(n <= write_sz); + if (n != write_sz) { + fprintf(stderr, "error: write\n"); + rc = -1; + goto err5; + } + } + } + +decomp_state: + + /* NX decompresses input data */ + + NXPRT(fprintf(stderr, "decomp_state:\n")); + + if (is_final) + goto finish_state; + + /* Address/len lists */ + clearp_dde(ddl_in); + clearp_dde(ddl_out); + + /* FC, CRC, HistLen, Table 6-6 */ + if (resuming) { + /* Resuming a partially decompressed input. + * The key to resume is supplying the 32KB + * dictionary (history) to NX, which is basically + * the last 32KB of output produced. + */ + fc = GZIP_FC_DECOMPRESS_RESUME; + + cmdp->cpb.in_crc = cmdp->cpb.out_crc; + cmdp->cpb.in_adler = cmdp->cpb.out_adler; + + /* Round up the history size to quadword. Section 2.10 */ + history_len = (history_len + 15) / 16; + putnn(cmdp->cpb, in_histlen, history_len); + history_len = history_len * 16; /* bytes */ + + if (history_len > 0) { + /* Chain in the history buffer to the DDE list */ + if (cur_out >= history_len) { + nx_append_dde(ddl_in, fifo_out + + (cur_out - history_len), + history_len); + } else { + nx_append_dde(ddl_in, fifo_out + + ((fifo_out_len + cur_out) + - history_len), + history_len - cur_out); + /* Up to 32KB history wraps around fifo_out */ + nx_append_dde(ddl_in, fifo_out, cur_out); + } + + } + } else { + /* First decompress job */ + fc = GZIP_FC_DECOMPRESS; + + history_len = 0; + /* Writing 0 clears out subc as well */ + cmdp->cpb.in_histlen = 0; + total_out = 0; + + put32(cmdp->cpb, in_crc, INIT_CRC); + put32(cmdp->cpb, in_adler, INIT_ADLER); + put32(cmdp->cpb, out_crc, INIT_CRC); + put32(cmdp->cpb, out_adler, INIT_ADLER); + + /* Assuming 10% compression ratio initially; use the + * most recently measured compression ratio as a + * heuristic to estimate the input and output + * sizes. If we give too much input, the target buffer + * overflows and NX cycles are wasted, and then we + * must retry with smaller input size. 1000 is 100%. 
+ */ + last_comp_ratio = 100UL; + } + cmdp->crb.gzip_fc = 0; + putnn(cmdp->crb, gzip_fc, fc); + + /* + * NX source buffers + */ + first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len); + last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len); + + if (first_used > 0) + nx_append_dde(ddl_in, fifo_in + cur_in, first_used); + + if (last_used > 0) + nx_append_dde(ddl_in, fifo_in, last_used); + + /* + * NX target buffers + */ + first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len); + last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len); + + /* Reduce output free space amount not to overwrite the history */ + int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len) + - (1<<16)); + + NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max, + target_max)); + + first_free = NX_MIN(target_max, first_free); + if (first_free > 0) { + first_offset = fifo_free_first_offset(cur_out, used_out); + nx_append_dde(ddl_out, fifo_out + first_offset, first_free); + } + + if (last_free > 0) { + last_free = NX_MIN(target_max - first_free, last_free); + if (last_free > 0) { + last_offset = fifo_free_last_offset(cur_out, used_out, + fifo_out_len); + nx_append_dde(ddl_out, fifo_out + last_offset, + last_free); + } + } + + /* Target buffer size is used to limit the source data size + * based on previous measurements of compression ratio. + */ + + /* source_sz includes history */ + source_sz = getp32(ddl_in, ddebc); + assert(source_sz > history_len); + source_sz = source_sz - history_len; + + /* Estimating how much source is needed to 3/4 fill a + * target_max size target buffer. If we overshoot, then NX + * must repeat the job with smaller input and we waste + * bandwidth. If we undershoot then we use more NX calls than + * necessary. + */ + + source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL) + / 4000; + + if (source_sz_estimate < source_sz) { + /* Target might be small, therefore limiting the + * source data. + */ + source_sz = source_sz_estimate; + target_sz_estimate = target_max; + } else { + /* Source file might be small, therefore limiting target + * touch pages to a smaller value to save processor cycles. + */ + target_sz_estimate = ((uint64_t)source_sz * 1000UL) + / (last_comp_ratio + 1); + target_sz_estimate = NX_MIN(2 * target_sz_estimate, + target_max); + } + + source_sz = source_sz + history_len; + + /* Some NX condition codes require submitting the NX job again. + * Kernel doesn't handle NX page faults. Expects user code to + * touch pages. + */ + pgfault_retries = NX_MAX_FAULTS; + +restart_nx: + + putp32(ddl_in, ddebc, source_sz); + + /* Fault in pages */ + nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1); + nx_touch_pages_dde(ddl_in, 0, page_sz, 0); + nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1); + + /* Send job to NX */ + cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle); + + switch (cc) { + + case ERR_NX_TRANSLATION: + + /* We touched the pages ahead of time. In the most common case + * we shouldn't be here. But may be some pages were paged out. + * Kernel should have placed the faulting address to fsaddr. 
+ */ + NXPRT(fprintf(stderr, "ERR_NX_TRANSLATION %p\n", + (void *)cmdp->crb.csb.fsaddr)); + + if (pgfault_retries == NX_MAX_FAULTS) { + /* Try once with exact number of pages */ + --pgfault_retries; + goto restart_nx; + } else if (pgfault_retries > 0) { + /* If still faulting try fewer input pages + * assuming memory outage + */ + if (source_sz > page_sz) + source_sz = NX_MAX(source_sz / 2, page_sz); + --pgfault_retries; + goto restart_nx; + } else { + fprintf(stderr, "cannot make progress; too many "); + fprintf(stderr, "page fault retries cc= %d\n", cc); + rc = -1; + goto err5; + } + + case ERR_NX_DATA_LENGTH: + + NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; ")); + NXPRT(fprintf(stderr, "stream may have trailing data\n")); + + /* Not an error in the most common case; it just says + * there is trailing data that we must examine. + * + * CC=3 CE(1)=0 CE(0)=1 indicates partial completion + * Fig.6-7 and Table 6-8. + */ + nx_ce = get_csb_ce_ms3b(cmdp->crb.csb); + + if (!csb_ce_termination(nx_ce) && + csb_ce_partial_completion(nx_ce)) { + /* Check CPB for more information + * spbc and tpbc are valid + */ + sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */ + subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */ + spbc = get32(cmdp->cpb, out_spbc_decomp); + tpbc = get32(cmdp->crb.csb, tpbc); + assert(target_max >= tpbc); + + goto ok_cc3; /* not an error */ + } else { + /* History length error when CE(1)=1 CE(0)=0. */ + rc = -1; + fprintf(stderr, "history length error cc= %d\n", cc); + goto err5; + } + + case ERR_NX_TARGET_SPACE: + + /* Target buffer not large enough; retry smaller input + * data; give at least 1 byte. SPBC/TPBC are not valid. + */ + assert(source_sz > history_len); + source_sz = ((source_sz - history_len + 2) / 2) + history_len; + NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with ")); + NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n", + source_sz, history_len)); + goto restart_nx; + + case ERR_NX_OK: + + /* This should not happen for gzip formatted data; + * we need trailing crc and isize + */ + fprintf(stderr, "ERR_NX_OK\n"); + spbc = get32(cmdp->cpb, out_spbc_decomp); + tpbc = get32(cmdp->crb.csb, tpbc); + assert(target_max >= tpbc); + assert(spbc >= history_len); + source_sz = spbc - history_len; + goto offsets_state; + + default: + fprintf(stderr, "error: cc= %d\n", cc); + rc = -1; + goto err5; + } + +ok_cc3: + + NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt)); + + assert(spbc > history_len); + source_sz = spbc - history_len; + + /* Table 6-4: Source Final Block Type (SFBT) describes the + * last processed deflate block and clues the software how to + * resume the next job. SUBC indicates how many input bits NX + * consumed but did not process. SPBC indicates how many + * bytes of source were given to the accelerator including + * history bytes. + */ + + switch (sfbt) { + int dhtlen; + + case 0x0: /* Deflate final EOB received */ + + /* Calculating the checksum start position. */ + + source_sz = source_sz - subc / 8; + is_final = 1; + break; + + /* Resume decompression cases are below. Basically + * indicates where NX has suspended and how to resume + * the input stream. + */ + + case 0x8: /* Within a literal block; use rembytecount */ + case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */ + + /* Supply the partially processed source byte again */ + source_sz = source_sz - ((subc + 7) / 8); + + /* SUBC LS 3bits: number of bits in the first source byte need + * to be processed. 
+ * 000 means all 8 bits; Table 6-3 + * Clear subc, histlen, sfbt, rembytecnt, dhtlen + */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb, + out_rembytecnt)); + break; + + case 0xA: /* Within a FH block; */ + case 0xB: /* Within a FH block; bfinal=1 */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + break; + + case 0xC: /* Within a DH block; */ + case 0xD: /* Within a DH block; bfinal=1 */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + + dhtlen = getnn(cmdp->cpb, out_dhtlen); + putnn(cmdp->cpb, in_dhtlen, dhtlen); + assert(dhtlen >= 42); + + /* Round up to a qword */ + dhtlen = (dhtlen + 127) / 128; + + while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */ + --dhtlen; + cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen]; + } + break; + + case 0xE: /* Within a block header; bfinal=0; */ + /* Also given if source data exactly ends (SUBC=0) with + * EOB code with BFINAL=0. Means the next byte will + * contain a block header. + */ + case 0xF: /* within a block header with BFINAL=1. */ + + source_sz = source_sz - ((subc + 7) / 8); + + /* Clear subc, histlen, sfbt, rembytecnt, dhtlen */ + cmdp->cpb.in_subc = 0; + cmdp->cpb.in_sfbt = 0; + putnn(cmdp->cpb, in_subc, subc % 8); + putnn(cmdp->cpb, in_sfbt, sfbt); + + /* Engine did not process any data */ + if (is_eof && (source_sz == 0)) + is_final = 1; + } + +offsets_state: + + /* Adjust the source and target buffer offsets and lengths */ + + NXPRT(fprintf(stderr, "offsets_state:\n")); + + /* Delete input data from fifo_in */ + used_in = used_in - source_sz; + cur_in = (cur_in + source_sz) % fifo_in_len; + input_file_offset = input_file_offset + source_sz; + + /* Add output data to fifo_out */ + used_out = used_out + tpbc; + + assert(used_out <= fifo_out_len); + + total_out = total_out + tpbc; + + /* Deflate history is 32KB max. No need to supply more + * than 32KB on a resume. + */ + history_len = (total_out > window_max) ? window_max : total_out; + + /* To estimate expected expansion in the next NX job; 500 means 50%. + * Deflate best case is around 1 to 1000. + */ + last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1)) + / ((uint64_t)tpbc + 1); + last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1); + NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n", + last_comp_ratio, source_sz, spbc, tpbc)); + + resuming = 1; + +finish_state: + + NXPRT(fprintf(stderr, "finish_state:\n")); + + if (is_final) { + if (used_out) + goto write_state; /* More data to write out */ + else if (used_in < 8) { + /* Need at least 8 more bytes containing gzip crc + * and isize. 
+ */ + rc = -1; + goto err4; + } else { + /* Compare checksums and exit */ + int i; + unsigned char tail[8]; + uint32_t cksum, isize; + + for (i = 0; i < 8; i++) + tail[i] = fifo_in[(cur_in + i) % fifo_in_len]; + fprintf(stderr, "computed checksum %08x isize %08x\n", + cmdp->cpb.out_crc, (uint32_t) (total_out + % (1ULL<<32))); + cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8 + | (uint32_t) tail[2]<<16 + | (uint32_t) tail[3]<<24); + isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8 + | (uint32_t) tail[6]<<16 + | (uint32_t) tail[7]<<24); + fprintf(stderr, "stored checksum %08x isize %08x\n", + cksum, isize); + + if (cksum == cmdp->cpb.out_crc && isize == (uint32_t) + (total_out % (1ULL<<32))) { + rc = 0; goto ok1; + } else { + rc = -1; goto err4; + } + } + } else + goto read_state; + + return -1; + +err1: + fprintf(stderr, "error: not a gzip file, expect %x, read %x\n", + expect, c); + return -1; + +err2: + fprintf(stderr, "error: the FLG byte is wrong or not being handled\n"); + return -1; + +err3: + fprintf(stderr, "error: gzip header\n"); + return -1; + +err4: + fprintf(stderr, "error: checksum missing or mismatch\n"); + +err5: +ok1: + fprintf(stderr, "decomp is complete: fclose\n"); + fclose(outf); + + return rc; +} + + +int main(int argc, char **argv) +{ + int rc; + struct sigaction act; + void *handle; + + nx_dbg = 0; + nx_gzip_log = NULL; + act.sa_handler = 0; + act.sa_sigaction = nxu_sigsegv_handler; + act.sa_flags = SA_SIGINFO; + act.sa_restorer = 0; + sigemptyset(&act.sa_mask); + sigaction(SIGSEGV, &act, NULL); + + handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0); + if (!handle) { + fprintf(stderr, "Unable to init NX, errno %d\n", errno); + exit(-1); + } + + rc = decompress_file(argc, argv, handle); + + nx_function_end(handle); + + return rc; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c new file mode 100644 index 000000000000..7496a83f9c9d --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* P9 gzip sample code for demonstrating the P9 NX hardware interface. + * Not intended for productive uses or for performance or compression + * ratio measurements. For simplicity of demonstration, this sample + * code compresses in to fixed Huffman blocks only (Deflate btype=1) + * and has very simple memory management. Dynamic Huffman blocks + * (Deflate btype=2) are more involved as detailed in the user guide. + * Note also that /dev/crypto/gzip, VAS and skiboot support are + * required. + * + * Copyright 2020 IBM Corp. + * + * https://github.com/libnxz/power-gzip for zlib api and other utils + * + * Author: Bulent Abali <abali@us.ibm.com> + * + * Definitions of acronyms used here. 
See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +#define _ISOC11_SOURCE // For aligned_alloc() +#define _DEFAULT_SOURCE // For endian.h + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <sys/mman.h> +#include <endian.h> +#include <bits/endian.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include "nxu.h" +#include "nx.h" + +int nx_dbg; +FILE *nx_gzip_log; + +#define NX_MIN(X, Y) (((X) < (Y)) ? (X) : (Y)) +#define FNAME_MAX 1024 +#define FEXT ".nx.gz" + +/* + * LZ counts returned in the user supplied nx_gzip_crb_cpb_t structure. + */ +static int compress_fht_sample(char *src, uint32_t srclen, char *dst, + uint32_t dstlen, int with_count, + struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + uint32_t fc; + + assert(!!cmdp); + + put32(cmdp->crb, gzip_fc, 0); /* clear */ + fc = (with_count) ? GZIP_FC_COMPRESS_RESUME_FHT_COUNT : + GZIP_FC_COMPRESS_RESUME_FHT; + putnn(cmdp->crb, gzip_fc, fc); + putnn(cmdp->cpb, in_histlen, 0); /* resuming with no history */ + memset((void *) &cmdp->crb.csb, 0, sizeof(cmdp->crb.csb)); + + /* Section 6.6 programming notes; spbc may be in two different + * places depending on FC. + */ + if (!with_count) + put32(cmdp->cpb, out_spbc_comp, 0); + else + put32(cmdp->cpb, out_spbc_comp_with_count, 0); + + /* Figure 6-3 6-4; CSB location */ + put64(cmdp->crb, csb_address, 0); + put64(cmdp->crb, csb_address, + (uint64_t) &cmdp->crb.csb & csb_address_mask); + + /* Source direct dde (scatter-gather list) */ + clear_dde(cmdp->crb.source_dde); + putnn(cmdp->crb.source_dde, dde_count, 0); + put32(cmdp->crb.source_dde, ddebc, srclen); + put64(cmdp->crb.source_dde, ddead, (uint64_t) src); + + /* Target direct dde (scatter-gather list) */ + clear_dde(cmdp->crb.target_dde); + putnn(cmdp->crb.target_dde, dde_count, 0); + put32(cmdp->crb.target_dde, ddebc, dstlen); + put64(cmdp->crb.target_dde, ddead, (uint64_t) dst); + + /* Submit the crb, the job descriptor, to the accelerator */ + return nxu_submit_job(cmdp, handle); +} + +/* + * Prepares a blank no filename no timestamp gzip header and returns + * the number of bytes written to buf. 
+ * Gzip specification at https://tools.ietf.org/html/rfc1952 + */ +int gzip_header_blank(char *buf) +{ + int i = 0; + + buf[i++] = 0x1f; /* ID1 */ + buf[i++] = 0x8b; /* ID2 */ + buf[i++] = 0x08; /* CM */ + buf[i++] = 0x00; /* FLG */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x00; /* MTIME */ + buf[i++] = 0x04; /* XFL 4=fastest */ + buf[i++] = 0x03; /* OS UNIX */ + + return i; +} + +/* Caller must free the allocated buffer return nonzero on error. */ +int read_alloc_input_file(char *fname, char **buf, size_t *bufsize) +{ + struct stat statbuf; + FILE *fp; + char *p; + size_t num_bytes; + + if (stat(fname, &statbuf)) { + perror(fname); + return(-1); + } + fp = fopen(fname, "r"); + if (fp == NULL) { + perror(fname); + return(-1); + } + assert(NULL != (p = (char *) malloc(statbuf.st_size))); + num_bytes = fread(p, 1, statbuf.st_size, fp); + if (ferror(fp) || (num_bytes != statbuf.st_size)) { + perror(fname); + return(-1); + } + *buf = p; + *bufsize = num_bytes; + return 0; +} + +/* Returns nonzero on error */ +int write_output_file(char *fname, char *buf, size_t bufsize) +{ + FILE *fp; + size_t num_bytes; + + fp = fopen(fname, "w"); + if (fp == NULL) { + perror(fname); + return(-1); + } + num_bytes = fwrite(buf, 1, bufsize, fp); + if (ferror(fp) || (num_bytes != bufsize)) { + perror(fname); + return(-1); + } + fclose(fp); + return 0; +} + +/* + * Z_SYNC_FLUSH as described in zlib.h. + * Returns number of appended bytes + */ +int append_sync_flush(char *buf, int tebc, int final) +{ + uint64_t flush; + int shift = (tebc & 0x7); + + if (tebc > 0) { + /* Last byte is partially full */ + buf = buf - 1; + *buf = *buf & (unsigned char) ((1<<tebc)-1); + } else + *buf = 0; + flush = ((0x1ULL & final) << shift) | *buf; + shift = shift + 3; /* BFINAL and BTYPE written */ + shift = (shift <= 8) ? 8 : 16; + flush |= (0xFFFF0000ULL) << shift; /* Zero length block */ + shift = shift + 32; + while (shift > 0) { + *buf++ = (unsigned char) (flush & 0xffULL); + flush = flush >> 8; + shift = shift - 8; + } + return(((tebc > 5) || (tebc == 0)) ? 5 : 4); +} + +/* + * Final deflate block bit. This call assumes the block + * beginning is byte aligned. 
+ */ +static void set_bfinal(void *buf, int bfinal) +{ + char *b = buf; + + if (bfinal) + *b = *b | (unsigned char) 0x01; + else + *b = *b & (unsigned char) 0xfe; +} + +int compress_file(int argc, char **argv, void *handle) +{ + char *inbuf, *outbuf, *srcbuf, *dstbuf; + char outname[FNAME_MAX]; + uint32_t srclen, dstlen; + uint32_t flushlen, chunk; + size_t inlen, outlen, dsttotlen, srctotlen; + uint32_t crc, spbc, tpbc, tebc; + int lzcounts = 0; + int cc; + int num_hdr_bytes; + struct nx_gzip_crb_cpb_t *cmdp; + uint32_t pagelen = 65536; + int fault_tries = NX_MAX_FAULTS; + + cmdp = (void *)(uintptr_t) + aligned_alloc(sizeof(struct nx_gzip_crb_cpb_t), + sizeof(struct nx_gzip_crb_cpb_t)); + + if (argc != 2) { + fprintf(stderr, "usage: %s <fname>\n", argv[0]); + exit(-1); + } + if (read_alloc_input_file(argv[1], &inbuf, &inlen)) + exit(-1); + fprintf(stderr, "file %s read, %ld bytes\n", argv[1], inlen); + + /* Generous output buffer for header/trailer */ + outlen = 2 * inlen + 1024; + + assert(NULL != (outbuf = (char *)malloc(outlen))); + nxu_touch_pages(outbuf, outlen, pagelen, 1); + + /* Compress piecemeal in smallish chunks */ + chunk = 1<<22; + + /* Write the gzip header to the stream */ + num_hdr_bytes = gzip_header_blank(outbuf); + dstbuf = outbuf + num_hdr_bytes; + outlen = outlen - num_hdr_bytes; + dsttotlen = num_hdr_bytes; + + srcbuf = inbuf; + srctotlen = 0; + + /* Init the CRB, the coprocessor request block */ + memset(&cmdp->crb, 0, sizeof(cmdp->crb)); + + /* Initial gzip crc32 */ + put32(cmdp->cpb, in_crc, 0); + + while (inlen > 0) { + + /* Submit chunk size source data per job */ + srclen = NX_MIN(chunk, inlen); + /* Supply large target in case data expands */ + dstlen = NX_MIN(2*srclen, outlen); + + /* Page faults are handled by the user code */ + + /* Fault-in pages; an improved code wouldn't touch so + * many pages but would try to estimate the + * compression ratio and adjust both the src and dst + * touch amounts. + */ + nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), pagelen, + 1); + nxu_touch_pages(srcbuf, srclen, pagelen, 0); + nxu_touch_pages(dstbuf, dstlen, pagelen, 1); + + cc = compress_fht_sample( + srcbuf, srclen, + dstbuf, dstlen, + lzcounts, cmdp, handle); + + if (cc != ERR_NX_OK && cc != ERR_NX_TPBC_GT_SPBC && + cc != ERR_NX_TRANSLATION) { + fprintf(stderr, "nx error: cc= %d\n", cc); + exit(-1); + } + + /* Page faults are handled by the user code */ + if (cc == ERR_NX_TRANSLATION) { + NXPRT(fprintf(stderr, "page fault: cc= %d, ", cc)); + NXPRT(fprintf(stderr, "try= %d, fsa= %08llx\n", + fault_tries, + (unsigned long long) cmdp->crb.csb.fsaddr)); + fault_tries--; + if (fault_tries > 0) { + continue; + } else { + fprintf(stderr, "error: cannot progress; "); + fprintf(stderr, "too many faults\n"); + exit(-1); + }; + } + + fault_tries = NX_MAX_FAULTS; /* Reset for the next chunk */ + + inlen = inlen - srclen; + srcbuf = srcbuf + srclen; + srctotlen = srctotlen + srclen; + + /* Two possible locations for spbc depending on the function + * code. + */ + spbc = (!lzcounts) ? 
get32(cmdp->cpb, out_spbc_comp) : + get32(cmdp->cpb, out_spbc_comp_with_count); + assert(spbc == srclen); + + /* Target byte count */ + tpbc = get32(cmdp->crb.csb, tpbc); + /* Target ending bit count */ + tebc = getnn(cmdp->cpb, out_tebc); + NXPRT(fprintf(stderr, "compressed chunk %d ", spbc)); + NXPRT(fprintf(stderr, "to %d bytes, tebc= %d\n", tpbc, tebc)); + + if (inlen > 0) { /* More chunks to go */ + set_bfinal(dstbuf, 0); + dstbuf = dstbuf + tpbc; + dsttotlen = dsttotlen + tpbc; + outlen = outlen - tpbc; + /* Round up to the next byte with a flush + * block; do not set the BFINAqL bit. + */ + flushlen = append_sync_flush(dstbuf, tebc, 0); + dsttotlen = dsttotlen + flushlen; + outlen = outlen - flushlen; + dstbuf = dstbuf + flushlen; + NXPRT(fprintf(stderr, "added sync_flush %d bytes\n", + flushlen)); + } else { /* Done */ + /* Set the BFINAL bit of the last block per Deflate + * specification. + */ + set_bfinal(dstbuf, 1); + dstbuf = dstbuf + tpbc; + dsttotlen = dsttotlen + tpbc; + outlen = outlen - tpbc; + } + + /* Resuming crc32 for the next chunk */ + crc = get32(cmdp->cpb, out_crc); + put32(cmdp->cpb, in_crc, crc); + crc = be32toh(crc); + } + + /* Append crc32 and ISIZE to the end */ + memcpy(dstbuf, &crc, 4); + memcpy(dstbuf+4, &srctotlen, 4); + dsttotlen = dsttotlen + 8; + outlen = outlen - 8; + + assert(FNAME_MAX > (strlen(argv[1]) + strlen(FEXT))); + strcpy(outname, argv[1]); + strcat(outname, FEXT); + if (write_output_file(outname, outbuf, dsttotlen)) { + fprintf(stderr, "write error: %s\n", outname); + exit(-1); + } + + fprintf(stderr, "compressed %ld to %ld bytes total, ", srctotlen, + dsttotlen); + fprintf(stderr, "crc32 checksum = %08x\n", crc); + + if (inbuf != NULL) + free(inbuf); + + if (outbuf != NULL) + free(outbuf); + + return 0; +} + +int main(int argc, char **argv) +{ + int rc; + struct sigaction act; + void *handle; + + nx_dbg = 0; + nx_gzip_log = NULL; + act.sa_handler = 0; + act.sa_sigaction = nxu_sigsegv_handler; + act.sa_flags = SA_SIGINFO; + act.sa_restorer = 0; + sigemptyset(&act.sa_mask); + sigaction(SIGSEGV, &act, NULL); + + handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0); + if (!handle) { + fprintf(stderr, "Unable to init NX, errno %d\n", errno); + exit(-1); + } + + rc = compress_file(argc, argv, handle); + + nx_function_end(handle); + + return rc; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c new file mode 100644 index 000000000000..c055885da40a --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* + * Copyright 2020 IBM Corp. 
+ * + * Author: Bulent Abali <abali@us.ibm.com> + * + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/fcntl.h> +#include <sys/mman.h> +#include <endian.h> +#include <bits/endian.h> +#include <sys/ioctl.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include "vas-api.h" +#include "nx.h" +#include "copy-paste.h" +#include "nxu.h" +#include "nx_dbg.h" +#include <sys/platform/ppc.h> + +#define barrier() +#define hwsync() ({ asm volatile("sync" ::: "memory"); }) + +#ifndef NX_NO_CPU_PRI +#define cpu_pri_default() ({ asm volatile ("or 2, 2, 2"); }) +#define cpu_pri_low() ({ asm volatile ("or 31, 31, 31"); }) +#else +#define cpu_pri_default() +#define cpu_pri_low() +#endif + +void *nx_fault_storage_address; + +struct nx_handle { + int fd; + int function; + void *paste_addr; +}; + +static int open_device_nodes(char *devname, int pri, struct nx_handle *handle) +{ + int rc, fd; + void *addr; + struct vas_tx_win_open_attr txattr; + + fd = open(devname, O_RDWR); + if (fd < 0) { + fprintf(stderr, " open device name %s\n", devname); + return -errno; + } + + memset(&txattr, 0, sizeof(txattr)); + txattr.version = 1; + txattr.vas_id = pri; + rc = ioctl(fd, VAS_TX_WIN_OPEN, (unsigned long)&txattr); + if (rc < 0) { + fprintf(stderr, "ioctl() n %d, error %d\n", rc, errno); + rc = -errno; + goto out; + } + + addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0ULL); + if (addr == MAP_FAILED) { + fprintf(stderr, "mmap() failed, errno %d\n", errno); + rc = -errno; + goto out; + } + handle->fd = fd; + handle->paste_addr = (void *)((char *)addr + 0x400); + + rc = 0; +out: + close(fd); + return rc; +} + +void *nx_function_begin(int function, int pri) +{ + int rc; + char *devname = "/dev/crypto/nx-gzip"; + struct nx_handle *nxhandle; + + if (function != NX_FUNC_COMP_GZIP) { + errno = EINVAL; + fprintf(stderr, " NX_FUNC_COMP_GZIP not found\n"); + return NULL; + } + + + nxhandle = malloc(sizeof(*nxhandle)); + if (!nxhandle) { + errno = ENOMEM; + fprintf(stderr, " No memory\n"); + return NULL; + } + + nxhandle->function = function; + rc = open_device_nodes(devname, pri, nxhandle); + if (rc < 0) { + errno = -rc; + fprintf(stderr, " open_device_nodes failed\n"); + return NULL; + } + + return nxhandle; +} + +int nx_function_end(void *handle) +{ + int rc = 0; + struct nx_handle *nxhandle = handle; + + rc = munmap(nxhandle->paste_addr - 0x400, 4096); + if (rc < 0) { + fprintf(stderr, "munmap() failed, errno %d\n", errno); + return rc; + } + close(nxhandle->fd); + free(nxhandle); + + return rc; +} + +static int nx_wait_for_csb(struct nx_gzip_crb_cpb_t *cmdp) +{ + long poll = 0; + uint64_t t; + + /* Save power and let other threads use the h/w. top may show + * 100% but only because OS doesn't know we slowed the this + * h/w thread while polling. We're letting other threads have + * higher throughput on the core. + */ + cpu_pri_low(); + +#define CSB_MAX_POLL 200000000UL +#define USLEEP_TH 300000UL + + t = __ppc_get_timebase(); + + while (getnn(cmdp->crb.csb, csb_v) == 0) { + ++poll; + hwsync(); + + cpu_pri_low(); + + /* usleep(0) takes around 29000 ticks ~60 us. + * 300000 is spinning for about 600 us then + * start sleeping. 
+ */ + if ((__ppc_get_timebase() - t) > USLEEP_TH) { + cpu_pri_default(); + usleep(1); + } + + if (poll > CSB_MAX_POLL) + break; + + /* Fault address from signal handler */ + if (nx_fault_storage_address) { + cpu_pri_default(); + return -EAGAIN; + } + + } + + cpu_pri_default(); + + /* hw has updated csb and output buffer */ + hwsync(); + + /* Check CSB flags. */ + if (getnn(cmdp->crb.csb, csb_v) == 0) { + fprintf(stderr, "CSB still not valid after %d polls.\n", + (int) poll); + prt_err("CSB still not valid after %d polls, giving up.\n", + (int) poll); + return -ETIMEDOUT; + } + + return 0; +} + +static int nxu_run_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + int i, ret, retries; + struct nx_handle *nxhandle = handle; + + assert(handle != NULL); + i = 0; + retries = 5000; + while (i++ < retries) { + hwsync(); + vas_copy(&cmdp->crb, 0); + ret = vas_paste(nxhandle->paste_addr, 0); + hwsync(); + + NXPRT(fprintf(stderr, "Paste attempt %d/%d returns 0x%x\n", + i, retries, ret)); + + if ((ret == 2) || (ret == 3)) { + + ret = nx_wait_for_csb(cmdp); + if (!ret) { + goto out; + } else if (ret == -EAGAIN) { + long x; + + prt_err("Touching address %p, 0x%lx\n", + nx_fault_storage_address, + *(long *) nx_fault_storage_address); + x = *(long *) nx_fault_storage_address; + *(long *) nx_fault_storage_address = x; + nx_fault_storage_address = 0; + continue; + } else { + prt_err("wait_for_csb() returns %d\n", ret); + break; + } + } else { + if (i < 10) { + /* spin for few ticks */ +#define SPIN_TH 500UL + uint64_t fail_spin; + + fail_spin = __ppc_get_timebase(); + while ((__ppc_get_timebase() - fail_spin) < + SPIN_TH) + ; + } else { + /* sleep */ + unsigned int pr = 0; + + if (pr++ % 100 == 0) { + prt_err("Paste attempt %d/", i); + prt_err("%d, failed pid= %d\n", retries, + getpid()); + } + usleep(1); + } + continue; + } + } + +out: + cpu_pri_default(); + + return ret; +} + +int nxu_submit_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle) +{ + int cc; + + cc = nxu_run_job(cmdp, handle); + + if (!cc) + cc = getnn(cmdp->crb.csb, csb_cc); /* CC Table 6-8 */ + + return cc; +} + + +void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx) +{ + fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(), + sig, info->si_code, info->si_addr); + + nx_fault_storage_address = info->si_addr; +} + +/* + * Fault in pages prior to NX job submission. wr=1 may be required to + * touch writeable pages. System zero pages do not fault-in the page as + * intended. Typically set wr=1 for NX target pages and set wr=0 for NX + * source pages. + */ +int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr) +{ + char *begin = buf; + char *end = (char *) buf + buf_len - 1; + volatile char t; + + assert(buf_len >= 0 && !!buf); + + NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf, + (buf + buf_len), buf_len, wr)); + + if (buf_len <= 0 || buf == NULL) + return -1; + + do { + t = *begin; + if (wr) + *begin = t; + begin = begin + page_len; + } while (begin < end); + + /* When buf_sz is small or buf tail is in another page */ + t = *end; + if (wr) + *end = t; + + return 0; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h new file mode 100644 index 000000000000..0db2d6485037 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +/* From asm-compat.h */ +#define __stringify_in_c(...) 
#__VA_ARGS__ +#define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " " + +/* + * Macros taken from arch/powerpc/include/asm/ppc-opcode.h and other + * header files. + */ +#define ___PPC_RA(a) (((a) & 0x1f) << 16) +#define ___PPC_RB(b) (((b) & 0x1f) << 11) + +#define PPC_INST_COPY 0x7c20060c +#define PPC_INST_PASTE 0x7c20070d + +#define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_PASTE(a, b) stringify_in_c(.long PPC_INST_PASTE | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define CR0_SHIFT 28 +#define CR0_MASK 0xF +/* + * Copy/paste instructions: + * + * copy RA,RB + * Copy contents of address (RA) + effective_address(RB) + * to internal copy-buffer. + * + * paste RA,RB + * Paste contents of internal copy-buffer to the address + * (RA) + effective_address(RB) + */ +static inline int vas_copy(void *crb, int offset) +{ + asm volatile(PPC_COPY(%0, %1)";" + : + : "b" (offset), "b" (crb) + : "memory"); + + return 0; +} + +static inline int vas_paste(void *paste_address, int offset) +{ + __u32 cr; + + cr = 0; + asm volatile(PPC_PASTE(%1, %2)";" + "mfocrf %0, 0x80;" + : "=r" (cr) + : "b" (offset), "b" (paste_address) + : "memory", "cr0"); + + return (cr >> CR0_SHIFT) & CR0_MASK; +} diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/crb.h b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h new file mode 100644 index 000000000000..ab101085fa7e --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __CRB_H +#define __CRB_H +#include <linux/types.h> +#include "nx.h" + +/* CCW 842 CI/FC masks + * NX P8 workbook, section 4.3.1, figure 4-6 + * "CI/FC Boundary by NX CT type" + */ +#define CCW_CI_842 (0x00003ff8) +#define CCW_FC_842 (0x00000007) + +/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */ + +#define CCB_VALUE (0x3fffffffffffffff) +#define CCB_ADDRESS (0xfffffffffffffff8) +#define CCB_CM (0x0000000000000007) +#define CCB_CM0 (0x0000000000000004) +#define CCB_CM12 (0x0000000000000003) + +#define CCB_CM0_ALL_COMPLETIONS (0x0) +#define CCB_CM0_LAST_IN_CHAIN (0x4) +#define CCB_CM12_STORE (0x0) +#define CCB_CM12_INTERRUPT (0x1) + +#define CCB_SIZE (0x10) +#define CCB_ALIGN CCB_SIZE + +struct coprocessor_completion_block { + __be64 value; + __be64 address; +} __aligned(CCB_ALIGN); + + +/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */ + +#define CSB_V (0x80) +#define CSB_F (0x04) +#define CSB_CH (0x03) +#define CSB_CE_INCOMPLETE (0x80) +#define CSB_CE_TERMINATION (0x40) +#define CSB_CE_TPBC (0x20) + +#define CSB_CC_SUCCESS (0) +#define CSB_CC_INVALID_ALIGN (1) +#define CSB_CC_OPERAND_OVERLAP (2) +#define CSB_CC_DATA_LENGTH (3) +#define CSB_CC_TRANSLATION (5) +#define CSB_CC_PROTECTION (6) +#define CSB_CC_RD_EXTERNAL (7) +#define CSB_CC_INVALID_OPERAND (8) +#define CSB_CC_PRIVILEGE (9) +#define CSB_CC_INTERNAL (10) +#define CSB_CC_WR_EXTERNAL (12) +#define CSB_CC_NOSPC (13) +#define CSB_CC_EXCESSIVE_DDE (14) +#define CSB_CC_WR_TRANSLATION (15) +#define CSB_CC_WR_PROTECTION (16) +#define CSB_CC_UNKNOWN_CODE (17) +#define CSB_CC_ABORT (18) +#define CSB_CC_TRANSPORT (20) +#define CSB_CC_SEGMENTED_DDL (31) +#define CSB_CC_PROGRESS_POINT (32) +#define CSB_CC_DDE_OVERFLOW (33) +#define CSB_CC_SESSION (34) +#define CSB_CC_PROVISION (36) +#define CSB_CC_CHAIN (37) +#define CSB_CC_SEQUENCE (38) +#define CSB_CC_HW (39) + +#define CSB_SIZE (0x10) +#define CSB_ALIGN CSB_SIZE + +struct coprocessor_status_block { + __u8 flags; + __u8 cs; + __u8 cc; + __u8 
ce; + __be32 count; + __be64 address; +} __aligned(CSB_ALIGN); + + +/* Chapter 6.5.10 Data-Descriptor List (DDL) + * each list contains one or more Data-Descriptor Entries (DDE) + */ + +#define DDE_P (0x8000) + +#define DDE_SIZE (0x10) +#define DDE_ALIGN DDE_SIZE + +struct data_descriptor_entry { + __be16 flags; + __u8 count; + __u8 index; + __be32 length; + __be64 address; +} __aligned(DDE_ALIGN); + + +/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */ + +#define CRB_SIZE (0x80) +#define CRB_ALIGN (0x100) /* Errata: requires 256 alignment */ + + +/* Coprocessor Status Block field + * ADDRESS address of CSB + * C CCB is valid + * AT 0 = addrs are virtual, 1 = addrs are phys + * M enable perf monitor + */ +#define CRB_CSB_ADDRESS (0xfffffffffffffff0) +#define CRB_CSB_C (0x0000000000000008) +#define CRB_CSB_AT (0x0000000000000002) +#define CRB_CSB_M (0x0000000000000001) + +struct coprocessor_request_block { + __be32 ccw; + __be32 flags; + __be64 csb_addr; + + struct data_descriptor_entry source; + struct data_descriptor_entry target; + + struct coprocessor_completion_block ccb; + + __u8 reserved[48]; + + struct coprocessor_status_block csb; +} __aligned(CRB_ALIGN); + +#define crb_csb_addr(c) __be64_to_cpu(c->csb_addr) +#define crb_nx_fault_addr(c) __be64_to_cpu(c->stamp.nx.fault_storage_addr) +#define crb_nx_flags(c) c->stamp.nx.flags +#define crb_nx_fault_status(c) c->stamp.nx.fault_status +#define crb_nx_pswid(c) c->stamp.nx.pswid + + +/* RFC02167 Initiate Coprocessor Instructions document + * Chapter 8.2.1.1.1 RS + * Chapter 8.2.3 Coprocessor Directive + * Chapter 8.2.4 Execution + * + * The CCW must be converted to BE before passing to icswx() + */ + +#define CCW_PS (0xff000000) +#define CCW_CT (0x00ff0000) +#define CCW_CD (0x0000ffff) +#define CCW_CL (0x0000c000) + +#endif diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h new file mode 100644 index 000000000000..1abe23fc29e8 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2020 IBM Corp. 
+ * + */ +#ifndef _NX_H +#define _NX_H + +#include <stdbool.h> + +#define NX_FUNC_COMP_842 1 +#define NX_FUNC_COMP_GZIP 2 + +#ifndef __aligned +#define __aligned(x) __attribute__((aligned(x))) +#endif + +struct nx842_func_args { + bool use_crc; + bool decompress; /* true decompress; false compress */ + bool move_data; + int timeout; /* seconds */ +}; + +struct nxbuf_t { + int len; + char *buf; +}; + +/* @function should be EFT (aka 842), GZIP etc */ +void *nx_function_begin(int function, int pri); + +int nx_function(void *handle, struct nxbuf_t *in, struct nxbuf_t *out, + void *arg); + +int nx_function_end(void *handle); + +#endif /* _NX_H */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h new file mode 100644 index 000000000000..16464e19c47f --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2020 IBM Corporation + * + */ + +#ifndef _NXU_DBG_H_ +#define _NXU_DBG_H_ + +#include <sys/file.h> +#include <stdint.h> +#include <stdio.h> +#include <time.h> +#include <pthread.h> + +extern FILE * nx_gzip_log; +extern int nx_gzip_trace; +extern unsigned int nx_gzip_inflate_impl; +extern unsigned int nx_gzip_deflate_impl; +extern unsigned int nx_gzip_inflate_flags; +extern unsigned int nx_gzip_deflate_flags; + +extern int nx_dbg; +pthread_mutex_t mutex_log; + +#define nx_gzip_trace_enabled() (nx_gzip_trace & 0x1) +#define nx_gzip_hw_trace_enabled() (nx_gzip_trace & 0x2) +#define nx_gzip_sw_trace_enabled() (nx_gzip_trace & 0x4) +#define nx_gzip_gather_statistics() (nx_gzip_trace & 0x8) +#define nx_gzip_per_stream_stat() (nx_gzip_trace & 0x10) + +#define prt(fmt, ...) do { \ + pthread_mutex_lock(&mutex_log); \ + flock(nx_gzip_log->_fileno, LOCK_EX); \ + time_t t; struct tm *m; time(&t); m = localtime(&t); \ + fprintf(nx_gzip_log, "[%04d/%02d/%02d %02d:%02d:%02d] " \ + "pid %d: " fmt, \ + (int)m->tm_year + 1900, (int)m->tm_mon+1, (int)m->tm_mday, \ + (int)m->tm_hour, (int)m->tm_min, (int)m->tm_sec, \ + (int)getpid(), ## __VA_ARGS__); \ + fflush(nx_gzip_log); \ + flock(nx_gzip_log->_fileno, LOCK_UN); \ + pthread_mutex_unlock(&mutex_log); \ +} while (0) + +/* Use in case of an error */ +#define prt_err(fmt, ...) do { if (nx_dbg >= 0) { \ + prt("%s:%u: Error: "fmt, \ + __FILE__, __LINE__, ## __VA_ARGS__); \ +}} while (0) + +/* Use in case of an warning */ +#define prt_warn(fmt, ...) do { if (nx_dbg >= 1) { \ + prt("%s:%u: Warning: "fmt, \ + __FILE__, __LINE__, ## __VA_ARGS__); \ +}} while (0) + +/* Informational printouts */ +#define prt_info(fmt, ...) do { if (nx_dbg >= 2) { \ + prt("Info: "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace zlib wrapper code */ +#define prt_trace(fmt, ...) do { if (nx_gzip_trace_enabled()) { \ + prt("### "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace statistics */ +#define prt_stat(fmt, ...) do { if (nx_gzip_gather_statistics()) { \ + prt("### "fmt, ## __VA_ARGS__); \ +}} while (0) + +/* Trace zlib hardware implementation */ +#define hw_trace(fmt, ...) do { \ + if (nx_gzip_hw_trace_enabled()) \ + fprintf(nx_gzip_log, "hhh " fmt, ## __VA_ARGS__); \ + } while (0) + +/* Trace zlib software implementation */ +#define sw_trace(fmt, ...) 
do { \ + if (nx_gzip_sw_trace_enabled()) \ + fprintf(nx_gzip_log, "sss " fmt, ## __VA_ARGS__); \ + } while (0) + + +/** + * str_to_num - Convert string into number and copy with endings like + * KiB for kilobyte + * MiB for megabyte + * GiB for gigabyte + */ +uint64_t str_to_num(char *str); +void nx_lib_debug(int onoff); + +#endif /* _NXU_DBG_H_ */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h new file mode 100644 index 000000000000..20a4e883e0d3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h @@ -0,0 +1,650 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Hardware interface of the NX-GZIP compression accelerator + * + * Copyright (C) IBM Corporation, 2020 + * + * Author: Bulent Abali <abali@us.ibm.com> + * + */ + +#ifndef _NXU_H +#define _NXU_H + +#include <stdint.h> +#include <endian.h> +#include "nx.h" + +/* deflate */ +#define LLSZ 286 +#define DSZ 30 + +/* nx */ +#define DHTSZ 18 +#define DHT_MAXSZ 288 +#define MAX_DDE_COUNT 256 + +/* util */ +#ifdef NXDBG +#define NXPRT(X) X +#else +#define NXPRT(X) +#endif + +#ifdef NXTIMER +#include <sys/platform/ppc.h> +#define NX_CLK(X) X +#define nx_get_time() __ppc_get_timebase() +#define nx_get_freq() __ppc_get_timebase_freq() +#else +#define NX_CLK(X) +#define nx_get_time() (-1) +#define nx_get_freq() (-1) +#endif + +#define NX_MAX_FAULTS 500 + +/* + * Definitions of acronyms used here. See + * P9 NX Gzip Accelerator User's Manual for details: + * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf + * + * adler/crc: 32 bit checksums appended to stream tail + * ce: completion extension + * cpb: coprocessor parameter block (metadata) + * crb: coprocessor request block (command) + * csb: coprocessor status block (status) + * dht: dynamic huffman table + * dde: data descriptor element (address, length) + * ddl: list of ddes + * dh/fh: dynamic and fixed huffman types + * fc: coprocessor function code + * histlen: history/dictionary length + * history: sliding window of up to 32KB of data + * lzcount: Deflate LZ symbol counts + * rembytecnt: remaining byte count + * sfbt: source final block type; last block's type during decomp + * spbc: source processed byte count + * subc: source unprocessed bit count + * tebc: target ending bit count; valid bits in the last byte + * tpbc: target processed byte count + * vas: virtual accelerator switch; the user mode interface + */ + +union nx_qw_t { + uint32_t word[4]; + uint64_t dword[2]; +} __aligned(16); + +/* + * Note: NX registers with fewer than 32 bits are declared by + * convention as uint32_t variables in unions. If *_offset and *_mask + * are defined for a variable, then use get_ put_ macros to + * conveniently access the register fields for endian conversions. + */ + +struct nx_dde_t { + /* Data Descriptor Element, Section 6.4 */ + union { + uint32_t dde_count; + /* When dde_count == 0 ddead is a pointer to a data buffer; + * ddebc is the buffer length bytes. + * When dde_count > 0 dde is an indirect dde; ddead is a + * pointer to a contiguous list of direct ddes; ddebc is the + * total length of all data pointed to by the list of direct + * ddes. Note that only one level of indirection is permitted. + * See Section 6.4 of the user manual for additional details. 
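+	 * For example, a single direct DDE describing one 64 KB source
+	 * buffer would have dde_count == 0, ddebc == 65536 and ddead
+	 * holding the address of that buffer.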
+ */ + }; + uint32_t ddebc; /* dde byte count */ + uint64_t ddead; /* dde address */ +} __aligned(16); + +struct nx_csb_t { + /* Coprocessor Status Block, Section 6.6 */ + union { + uint32_t csb_v; + /* Valid bit. v must be set to 0 by the program + * before submitting the coprocessor command. + * Software can poll for the v bit + */ + + uint32_t csb_f; + /* 16B CSB size. Written to 0 by DMA when it writes the CPB */ + + uint32_t csb_cs; + /* cs completion sequence; unused */ + + uint32_t csb_cc; + /* cc completion code; cc != 0 exception occurred */ + + uint32_t csb_ce; + /* ce completion extension */ + + }; + uint32_t tpbc; + /* target processed byte count TPBC */ + + uint64_t fsaddr; + /* Section 6.12.1 CSB NonZero error summary. FSA Failing storage + * address. Address where error occurred. When available, written + * to A field of CSB + */ +} __aligned(16); + +struct nx_ccb_t { + /* Coprocessor Completion Block, Section 6.7 */ + + uint32_t reserved[3]; + union { + /* When crb.c==0 (no ccb defined) it is reserved; + * When crb.c==1 (ccb defined) it is cm + */ + + uint32_t ccb_cm; + /* Signal interrupt of crb.c==1 and cm==1 */ + + uint32_t word; + /* generic access to the 32bit word */ + }; +} __aligned(16); + +struct vas_stamped_crb_t { + /* + * CRB operand of the paste coprocessor instruction is stamped + * in quadword 4 with the information shown here as its written + * in to the receive FIFO of the coprocessor + */ + + union { + uint32_t vas_buf_num; + /* Verification only vas buffer number which correlates to + * the low order bits of the atag in the paste command + */ + + uint32_t send_wc_id; + /* Pointer to Send Window Context that provides for NX address + * translation information, such as MSR and LPCR bits, job + * completion interrupt RA, PSWID, and job utilization counter. + */ + + }; + union { + uint32_t recv_wc_id; + /* Pointer to Receive Window Context. NX uses this to return + * credits to a Receive FIFO as entries are dequeued. + */ + + }; + uint32_t reserved2; + union { + uint32_t vas_invalid; + /* Invalid bit. If this bit is 1 the CRB is discarded by + * NX upon fetching from the receive FIFO. If this bit is 0 + * the CRB is processed normally. The bit is stamped to 0 + * by VAS and may be written to 1 by hypervisor while + * the CRB is in the receive FIFO (in memory). + */ + + }; +}; + +struct nx_stamped_fault_crb_t { + /* + * A CRB that has a translation fault is stamped by NX in quadword 4 + * and pasted to the Fault Send Window in VAS. + */ + uint64_t fsa; + union { + uint32_t nxsf_t; + uint32_t nxsf_fs; + }; + uint32_t pswid; +}; + +union stamped_crb_t { + struct vas_stamped_crb_t vas; + struct nx_stamped_fault_crb_t nx; +}; + +struct nx_gzip_cpb_t { + /* + * Coprocessor Parameter Block In/Out are used to pass metadata + * to/from accelerator. Tables 6.5 and 6.6 of the user manual. 
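+	 * In these tests, software fills CPBInput before pasting the CRB
+	 * (e.g. put32(cmdp->cpb, in_crc, crc) to resume a checksum) and
+	 * reads CPBOutput fields such as out_crc, out_tebc and the
+	 * out_spbc_* variants after the job completes.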
+ */ + + /* CPBInput */ + + struct { + union { + union nx_qw_t qw0; + struct { + uint32_t in_adler; /* bits 0:31 */ + uint32_t in_crc; /* bits 32:63 */ + union { + uint32_t in_histlen; /* bits 64:75 */ + uint32_t in_subc; /* bits 93:95 */ + }; + union { + /* bits 108:111 */ + uint32_t in_sfbt; + /* bits 112:127 */ + uint32_t in_rembytecnt; + /* bits 116:127 */ + uint32_t in_dhtlen; + }; + }; + }; + union { + union nx_qw_t in_dht[DHTSZ]; /* qw[1:18] */ + char in_dht_char[DHT_MAXSZ]; /* byte access */ + }; + union nx_qw_t reserved[5]; /* qw[19:23] */ + }; + + /* CPBOutput */ + + volatile struct { + union { + union nx_qw_t qw24; + struct { + uint32_t out_adler; /* bits 0:31 qw[24] */ + uint32_t out_crc; /* bits 32:63 qw[24] */ + union { + /* bits 77:79 qw[24] */ + uint32_t out_tebc; + /* bits 80:95 qw[24] */ + uint32_t out_subc; + }; + union { + /* bits 108:111 qw[24] */ + uint32_t out_sfbt; + /* bits 112:127 qw[24] */ + uint32_t out_rembytecnt; + /* bits 116:127 qw[24] */ + uint32_t out_dhtlen; + }; + }; + }; + union { + union nx_qw_t qw25[79]; /* qw[25:103] */ + /* qw[25] compress no lzcounts or wrap */ + uint32_t out_spbc_comp_wrap; + uint32_t out_spbc_wrap; /* qw[25] wrap */ + /* qw[25] compress no lzcounts */ + uint32_t out_spbc_comp; + /* 286 LL and 30 D symbol counts */ + uint32_t out_lzcount[LLSZ+DSZ]; + struct { + union nx_qw_t out_dht[DHTSZ]; /* qw[25:42] */ + /* qw[43] decompress */ + uint32_t out_spbc_decomp; + }; + }; + /* qw[104] compress with lzcounts */ + uint32_t out_spbc_comp_with_count; + }; +} __aligned(128); + +struct nx_gzip_crb_t { + union { /* byte[0:3] */ + uint32_t gzip_fc; /* bits[24-31] */ + }; + uint32_t reserved1; /* byte[4:7] */ + union { + uint64_t csb_address; /* byte[8:15] */ + struct { + uint32_t reserved2; + union { + uint32_t crb_c; + /* c==0 no ccb defined */ + + uint32_t crb_at; + /* at==0 address type is ignored; + * all addrs effective assumed. + */ + + }; + }; + }; + struct nx_dde_t source_dde; /* byte[16:31] */ + struct nx_dde_t target_dde; /* byte[32:47] */ + volatile struct nx_ccb_t ccb; /* byte[48:63] */ + volatile union { + /* byte[64:239] shift csb by 128 bytes out of the crb; csb was + * in crb earlier; JReilly says csb written with partial inject + */ + union nx_qw_t reserved64[11]; + union stamped_crb_t stamp; /* byte[64:79] */ + }; + volatile struct nx_csb_t csb; +} __aligned(128); + +struct nx_gzip_crb_cpb_t { + struct nx_gzip_crb_t crb; + struct nx_gzip_cpb_t cpb; +} __aligned(2048); + + +/* + * NX hardware convention has the msb bit on the left numbered 0. + * The defines below has *_offset defined as the right most bit + * position of a field. x of size_mask(x) is the field width in bits. + */ + +#define size_mask(x) ((1U<<(x))-1) + +/* + * Offsets and Widths within the containing 32 bits of the various NX + * gzip hardware registers. 
Use the getnn/putnn macros to access + * these regs + */ + +#define dde_count_mask size_mask(8) +#define dde_count_offset 23 + +/* CSB */ + +#define csb_v_mask size_mask(1) +#define csb_v_offset 0 +#define csb_f_mask size_mask(1) +#define csb_f_offset 6 +#define csb_cs_mask size_mask(8) +#define csb_cs_offset 15 +#define csb_cc_mask size_mask(8) +#define csb_cc_offset 23 +#define csb_ce_mask size_mask(8) +#define csb_ce_offset 31 + +/* CCB */ + +#define ccb_cm_mask size_mask(3) +#define ccb_cm_offset 31 + +/* VAS stamped CRB fields */ + +#define vas_buf_num_mask size_mask(6) +#define vas_buf_num_offset 5 +#define send_wc_id_mask size_mask(16) +#define send_wc_id_offset 31 +#define recv_wc_id_mask size_mask(16) +#define recv_wc_id_offset 31 +#define vas_invalid_mask size_mask(1) +#define vas_invalid_offset 31 + +/* NX stamped fault CRB fields */ + +#define nxsf_t_mask size_mask(1) +#define nxsf_t_offset 23 +#define nxsf_fs_mask size_mask(8) +#define nxsf_fs_offset 31 + +/* CPB input */ + +#define in_histlen_mask size_mask(12) +#define in_histlen_offset 11 +#define in_dhtlen_mask size_mask(12) +#define in_dhtlen_offset 31 +#define in_subc_mask size_mask(3) +#define in_subc_offset 31 +#define in_sfbt_mask size_mask(4) +#define in_sfbt_offset 15 +#define in_rembytecnt_mask size_mask(16) +#define in_rembytecnt_offset 31 + +/* CPB output */ + +#define out_tebc_mask size_mask(3) +#define out_tebc_offset 15 +#define out_subc_mask size_mask(16) +#define out_subc_offset 31 +#define out_sfbt_mask size_mask(4) +#define out_sfbt_offset 15 +#define out_rembytecnt_mask size_mask(16) +#define out_rembytecnt_offset 31 +#define out_dhtlen_mask size_mask(12) +#define out_dhtlen_offset 31 + +/* CRB */ + +#define gzip_fc_mask size_mask(8) +#define gzip_fc_offset 31 +#define crb_c_mask size_mask(1) +#define crb_c_offset 28 +#define crb_at_mask size_mask(1) +#define crb_at_offset 30 +#define csb_address_mask ~(15UL) /* mask off bottom 4b */ + +/* + * Access macros for the registers. Do not access registers directly + * because of the endian conversion. P9 processor may run either as + * Little or Big endian. However the NX coprocessor regs are always + * big endian. + * Use the 32 and 64b macros to access respective + * register sizes. + * Use nn forms for the register fields shorter than 32 bits. 
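+ * For example, getnn(cmdp->crb.csb, csb_v) extracts the 1-bit valid
+ * flag from the big-endian CSB, while get32(cmdp->crb.csb, tpbc)
+ * returns the full 32-bit target processed byte count.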
+ */ + +#define getnn(ST, REG) ((be32toh(ST.REG) >> (31-REG##_offset)) \ + & REG##_mask) +#define getpnn(ST, REG) ((be32toh((ST)->REG) >> (31-REG##_offset)) \ + & REG##_mask) +#define get32(ST, REG) (be32toh(ST.REG)) +#define getp32(ST, REG) (be32toh((ST)->REG)) +#define get64(ST, REG) (be64toh(ST.REG)) +#define getp64(ST, REG) (be64toh((ST)->REG)) + +#define unget32(ST, REG) (get32(ST, REG) & ~((REG##_mask) \ + << (31-REG##_offset))) +/* get 32bits less the REG field */ + +#define ungetp32(ST, REG) (getp32(ST, REG) & ~((REG##_mask) \ + << (31-REG##_offset))) +/* get 32bits less the REG field */ + +#define clear_regs(ST) memset((void *)(&(ST)), 0, sizeof(ST)) +#define clear_dde(ST) do { ST.dde_count = ST.ddebc = 0; ST.ddead = 0; \ + } while (0) +#define clearp_dde(ST) do { (ST)->dde_count = (ST)->ddebc = 0; \ + (ST)->ddead = 0; \ + } while (0) +#define clear_struct(ST) memset((void *)(&(ST)), 0, sizeof(ST)) +#define putnn(ST, REG, X) (ST.REG = htobe32(unget32(ST, REG) | (((X) \ + & REG##_mask) << (31-REG##_offset)))) +#define putpnn(ST, REG, X) ((ST)->REG = htobe32(ungetp32(ST, REG) \ + | (((X) & REG##_mask) << (31-REG##_offset)))) + +#define put32(ST, REG, X) (ST.REG = htobe32(X)) +#define putp32(ST, REG, X) ((ST)->REG = htobe32(X)) +#define put64(ST, REG, X) (ST.REG = htobe64(X)) +#define putp64(ST, REG, X) ((ST)->REG = htobe64(X)) + +/* + * Completion extension ce(0) ce(1) ce(2). Bits ce(3-7) + * unused. Section 6.6 Figure 6.7. + */ + +#define get_csb_ce(ST) ((uint32_t)getnn(ST, csb_ce)) +#define get_csb_ce_ms3b(ST) (get_csb_ce(ST) >> 5) +#define put_csb_ce_ms3b(ST, X) putnn(ST, csb_ce, ((uint32_t)(X) << 5)) + +#define CSB_CE_PARTIAL 0x4 +#define CSB_CE_TERMINATE 0x2 +#define CSB_CE_TPBC_VALID 0x1 + +#define csb_ce_termination(X) (!!((X) & CSB_CE_TERMINATE)) +/* termination, output buffers may be modified, SPBC/TPBC invalid Fig.6-7 */ + +#define csb_ce_check_completion(X) (!csb_ce_termination(X)) +/* if not terminated then check full or partial completion */ + +#define csb_ce_partial_completion(X) (!!((X) & CSB_CE_PARTIAL)) +#define csb_ce_full_completion(X) (!csb_ce_partial_completion(X)) +#define csb_ce_tpbc_valid(X) (!!((X) & CSB_CE_TPBC_VALID)) +/* TPBC indicates successfully stored data count */ + +#define csb_ce_default_err(X) csb_ce_termination(X) +/* most error CEs have CE(0)=0 and CE(1)=1 */ + +#define csb_ce_cc3_partial(X) csb_ce_partial_completion(X) +/* some CC=3 are partially completed, Table 6-8 */ + +#define csb_ce_cc64(X) ((X)&(CSB_CE_PARTIAL \ + | CSB_CE_TERMINATE) == 0) +/* Compression: when TPBC>SPBC then CC=64 Table 6-8; target didn't + * compress smaller than source. + */ + +/* Decompress SFBT combinations Tables 5-3, 6-4, 6-6 */ + +#define SFBT_BFINAL 0x1 +#define SFBT_LIT 0x4 +#define SFBT_FHT 0x5 +#define SFBT_DHT 0x6 +#define SFBT_HDR 0x7 + +/* + * NX gzip function codes. Table 6.2. + * Bits 0:4 are the FC. Bit 5 is used by the DMA controller to + * select one of the two Byte Count Limits. 
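+ * For example, GZIP_FC_COMPRESS_FHT_COUNT (0x04) is a fixed-Huffman
+ * compress that also returns LZ symbol counts; fc_has_count() below
+ * keys off that 0x04 bit, and fc_is_compress() off the absence of
+ * the 0x10 decompress bit.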
+ */ + +#define GZIP_FC_LIMIT_MASK 0x01 +#define GZIP_FC_COMPRESS_FHT 0x00 +#define GZIP_FC_COMPRESS_DHT 0x02 +#define GZIP_FC_COMPRESS_FHT_COUNT 0x04 +#define GZIP_FC_COMPRESS_DHT_COUNT 0x06 +#define GZIP_FC_COMPRESS_RESUME_FHT 0x08 +#define GZIP_FC_COMPRESS_RESUME_DHT 0x0a +#define GZIP_FC_COMPRESS_RESUME_FHT_COUNT 0x0c +#define GZIP_FC_COMPRESS_RESUME_DHT_COUNT 0x0e +#define GZIP_FC_DECOMPRESS 0x10 +#define GZIP_FC_DECOMPRESS_SINGLE_BLK_N_SUSPEND 0x12 +#define GZIP_FC_DECOMPRESS_RESUME 0x14 +#define GZIP_FC_DECOMPRESS_RESUME_SINGLE_BLK_N_SUSPEND 0x16 +#define GZIP_FC_WRAP 0x1e + +#define fc_is_compress(fc) (((fc) & 0x10) == 0) +#define fc_has_count(fc) (fc_is_compress(fc) && (((fc) & 0x4) != 0)) + +/* CSB.CC Error codes */ + +#define ERR_NX_OK 0 +#define ERR_NX_ALIGNMENT 1 +#define ERR_NX_OPOVERLAP 2 +#define ERR_NX_DATA_LENGTH 3 +#define ERR_NX_TRANSLATION 5 +#define ERR_NX_PROTECTION 6 +#define ERR_NX_EXTERNAL_UE7 7 +#define ERR_NX_INVALID_OP 8 +#define ERR_NX_PRIVILEGE 9 +#define ERR_NX_INTERNAL_UE 10 +#define ERR_NX_EXTERN_UE_WR 12 +#define ERR_NX_TARGET_SPACE 13 +#define ERR_NX_EXCESSIVE_DDE 14 +#define ERR_NX_TRANSL_WR 15 +#define ERR_NX_PROTECT_WR 16 +#define ERR_NX_SUBFUNCTION 17 +#define ERR_NX_FUNC_ABORT 18 +#define ERR_NX_BYTE_MAX 19 +#define ERR_NX_CORRUPT_CRB 20 +#define ERR_NX_INVALID_CRB 21 +#define ERR_NX_INVALID_DDE 30 +#define ERR_NX_SEGMENTED_DDL 31 +#define ERR_NX_DDE_OVERFLOW 33 +#define ERR_NX_TPBC_GT_SPBC 64 +#define ERR_NX_MISSING_CODE 66 +#define ERR_NX_INVALID_DIST 67 +#define ERR_NX_INVALID_DHT 68 +#define ERR_NX_EXTERNAL_UE90 90 +#define ERR_NX_WDOG_TIMER 224 +#define ERR_NX_AT_FAULT 250 +#define ERR_NX_INTR_SERVER 252 +#define ERR_NX_UE253 253 +#define ERR_NX_NO_HW 254 +#define ERR_NX_HUNG_OP 255 +#define ERR_NX_END 256 + +/* initial values for non-resume operations */ +#define INIT_CRC 0 /* crc32(0L, Z_NULL, 0) */ +#define INIT_ADLER 1 /* adler32(0L, Z_NULL, 0) adler is initialized to 1 */ + +/* prototypes */ +int nxu_submit_job(struct nx_gzip_crb_cpb_t *c, void *handle); + +extern void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx); +extern int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr); + +/* caller supplies a print buffer 4*sizeof(crb) */ + +char *nx_crb_str(struct nx_gzip_crb_t *crb, char *prbuf); +char *nx_cpb_str(struct nx_gzip_cpb_t *cpb, char *prbuf); +char *nx_prt_hex(void *cp, int sz, char *prbuf); +char *nx_lzcount_str(struct nx_gzip_cpb_t *cpb, char *prbuf); +char *nx_strerror(int e); + +#ifdef NX_SIM +#include <stdio.h> +int nx_sim_init(void *ctx); +int nx_sim_end(void *ctx); +int nxu_run_sim_job(struct nx_gzip_crb_cpb_t *c, void *ctx); +#endif /* NX_SIM */ + +/* Deflate stream manipulation */ + +#define set_final_bit(x) (x |= (unsigned char)1) +#define clr_final_bit(x) (x &= ~(unsigned char)1) + +#define append_empty_fh_blk(p, b) do { *(p) = (2 | (1&(b))); *((p)+1) = 0; \ + } while (0) +/* append 10 bits 0000001b 00...... ; + * assumes appending starts on a byte boundary; b is the final bit. + */ + + +#ifdef NX_842 + +/* 842 Engine */ + +struct nx_eft_crb_t { + union { /* byte[0:3] */ + uint32_t eft_fc; /* bits[29-31] */ + }; + uint32_t reserved1; /* byte[4:7] */ + union { + uint64_t csb_address; /* byte[8:15] */ + struct { + uint32_t reserved2; + union { + uint32_t crb_c; + /* c==0 no ccb defined */ + + uint32_t crb_at; + /* at==0 address type is ignored; + * all addrs effective assumed. 
+ */ + + }; + }; + }; + struct nx_dde_t source_dde; /* byte[16:31] */ + struct nx_dde_t target_dde; /* byte[32:47] */ + struct nx_ccb_t ccb; /* byte[48:63] */ + union { + union nx_qw_t reserved64[3]; /* byte[64:96] */ + }; + struct nx_csb_t csb; +} __aligned(128); + +/* 842 CRB */ + +#define EFT_FC_MASK size_mask(3) +#define EFT_FC_OFFSET 31 +#define EFT_FC_COMPRESS 0x0 +#define EFT_FC_COMPRESS_WITH_CRC 0x1 +#define EFT_FC_DECOMPRESS 0x2 +#define EFT_FC_DECOMPRESS_WITH_CRC 0x3 +#define EFT_FC_BLK_DATA_MOVE 0x4 +#endif /* NX_842 */ + +#endif /* _NXU_H */ diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h new file mode 120000 index 000000000000..77fb4c7236d0 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/uapi/asm/vas-api.h
\ No newline at end of file diff --git a/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh new file mode 100755 index 000000000000..c7b46c5fd7b3 --- /dev/null +++ b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ ! -w /dev/crypto/nx-gzip ]]; then + echo "Can't access /dev/crypto/nx-gzip, skipping" + echo "skip: $0" + exit 4 +fi + +set -e + +function cleanup +{ + rm -f nx-tempfile* +} + +trap cleanup EXIT + +function test_sizes +{ + local n=$1 + local fname="nx-tempfile.$n" + + for size in 4K 64K 1M 64M + do + echo "Testing $size ($n) ..." + dd if=/dev/urandom of=$fname bs=$size count=1 + ./gzfht_test $fname + ./gunz_test ${fname}.nx.gz + done +} + +echo "Doing basic test of different sizes ..." +test_sizes 0 + +echo "Running tests in parallel ..." +for i in {1..16} +do + test_sizes $i & +done + +wait + +echo "OK" + +exit 0 diff --git a/tools/testing/selftests/powerpc/pmu/.gitignore b/tools/testing/selftests/powerpc/pmu/.gitignore index ff7896903d7b..f69b1e2641a1 100644 --- a/tools/testing/selftests/powerpc/pmu/.gitignore +++ b/tools/testing/selftests/powerpc/pmu/.gitignore @@ -2,3 +2,4 @@ count_instructions l3_bank_test per_event_excludes +count_stcx_fail diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 19046db995fe..904672fb78dd 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -2,7 +2,7 @@ noarg: $(MAKE) -C ../ -TEST_GEN_PROGS := count_instructions l3_bank_test per_event_excludes +TEST_GEN_PROGS := count_instructions count_stcx_fail l3_bank_test per_event_excludes EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. @@ -13,8 +13,12 @@ all: $(TEST_GEN_PROGS) ebb $(TEST_GEN_PROGS): $(EXTRA_SOURCES) # loop.S can only be built 64-bit +$(OUTPUT)/count_instructions: CFLAGS += -m64 $(OUTPUT)/count_instructions: loop.S count_instructions.c $(EXTRA_SOURCES) - $(CC) $(CFLAGS) -m64 -o $@ $^ + +$(OUTPUT)/count_stcx_fail: CFLAGS += -m64 +$(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) + $(OUTPUT)/per_event_excludes: ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c new file mode 100644 index 000000000000..7b4ac4537702 --- /dev/null +++ b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c @@ -0,0 +1,161 @@ +/* + * Copyright 2013, Michael Ellerman, IBM Corp. + * Licensed under GPLv2. 
+ */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdbool.h> +#include <string.h> +#include <sys/prctl.h> + +#include "event.h" +#include "utils.h" +#include "lib.h" + +extern void thirty_two_instruction_loop_with_ll_sc(u64 loops, u64 *ll_sc_target); + +static void setup_event(struct event *e, u64 config, int type, char *name) +{ + event_init_opts(e, config, type, name); + + e->attr.disabled = 1; + e->attr.exclude_kernel = 1; + e->attr.exclude_hv = 1; + e->attr.exclude_idle = 1; +} + +static int do_count_loop(struct event *events, u64 instructions, + u64 overhead, bool report) +{ + s64 difference, expected; + double percentage; + u64 dummy; + + prctl(PR_TASK_PERF_EVENTS_ENABLE); + + /* Run for 1M instructions */ + thirty_two_instruction_loop_with_ll_sc(instructions >> 5, &dummy); + + prctl(PR_TASK_PERF_EVENTS_DISABLE); + + event_read(&events[0]); + event_read(&events[1]); + event_read(&events[2]); + + expected = instructions + overhead + (events[2].result.value * 10); + difference = events[0].result.value - expected; + percentage = (double)difference / events[0].result.value * 100; + + if (report) { + printf("-----\n"); + event_report(&events[0]); + event_report(&events[1]); + event_report(&events[2]); + + printf("Looped for %llu instructions, overhead %llu\n", instructions, overhead); + printf("Expected %llu\n", expected); + printf("Actual %llu\n", events[0].result.value); + printf("Delta %lld, %f%%\n", difference, percentage); + } + + event_reset(&events[0]); + event_reset(&events[1]); + event_reset(&events[2]); + + if (difference < 0) + difference = -difference; + + /* Tolerate a difference below 0.0001 % */ + difference *= 10000 * 100; + if (difference / events[0].result.value) + return -1; + + return 0; +} + +/* Count how many instructions it takes to do a null loop */ +static u64 determine_overhead(struct event *events) +{ + u64 current, overhead; + int i; + + do_count_loop(events, 0, 0, false); + overhead = events[0].result.value; + + for (i = 0; i < 100; i++) { + do_count_loop(events, 0, 0, false); + current = events[0].result.value; + if (current < overhead) { + printf("Replacing overhead %llu with %llu\n", overhead, current); + overhead = current; + } + } + + return overhead; +} + +#define PM_MRK_STCX_FAIL 0x03e158 +#define PM_STCX_FAIL 0x01e058 + +static int test_body(void) +{ + struct event events[3]; + u64 overhead; + + setup_event(&events[0], PERF_COUNT_HW_INSTRUCTIONS, PERF_TYPE_HARDWARE, "instructions"); + setup_event(&events[1], PERF_COUNT_HW_CPU_CYCLES, PERF_TYPE_HARDWARE, "cycles"); + setup_event(&events[2], PM_STCX_FAIL, PERF_TYPE_RAW, "stcx_fail"); + + if (event_open(&events[0])) { + perror("perf_event_open"); + return -1; + } + + if (event_open_with_group(&events[1], events[0].fd)) { + perror("perf_event_open"); + return -1; + } + + if (event_open_with_group(&events[2], events[0].fd)) { + perror("perf_event_open"); + return -1; + } + + overhead = determine_overhead(events); + printf("Overhead of null loop: %llu instructions\n", overhead); + + /* Run for 1Mi instructions */ + FAIL_IF(do_count_loop(events, 1000000, overhead, true)); + + /* Run for 10Mi instructions */ + FAIL_IF(do_count_loop(events, 10000000, overhead, true)); + + /* Run for 100Mi instructions */ + FAIL_IF(do_count_loop(events, 100000000, overhead, true)); + + /* Run for 1Bi instructions */ + FAIL_IF(do_count_loop(events, 1000000000, overhead, true)); + + /* Run for 16Bi instructions */ + FAIL_IF(do_count_loop(events, 16000000000, overhead, true)); + + /* Run for 64Bi instructions */ + 
FAIL_IF(do_count_loop(events, 64000000000, overhead, true)); + + event_close(&events[0]); + event_close(&events[1]); + + return 0; +} + +static int count_ll_sc(void) +{ + return eat_cpu(test_body); +} + +int main(void) +{ + return test_harness(count_ll_sc, "count_ll_sc"); +} diff --git a/tools/testing/selftests/powerpc/pmu/ebb/trace.h b/tools/testing/selftests/powerpc/pmu/ebb/trace.h index 7c0fb5d2bdb1..da2a3be5441f 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/trace.h +++ b/tools/testing/selftests/powerpc/pmu/ebb/trace.h @@ -18,7 +18,7 @@ struct trace_entry { u8 type; u8 length; - u8 data[0]; + u8 data[]; }; struct trace_buffer @@ -26,7 +26,7 @@ struct trace_buffer u64 size; bool overflow; void *tail; - u8 data[0]; + u8 data[]; }; struct trace_buffer *trace_buffer_allocate(u64 size); diff --git a/tools/testing/selftests/powerpc/pmu/loop.S b/tools/testing/selftests/powerpc/pmu/loop.S index 8cc9b5e2c9de..c52ba09b6fed 100644 --- a/tools/testing/selftests/powerpc/pmu/loop.S +++ b/tools/testing/selftests/powerpc/pmu/loop.S @@ -41,3 +41,38 @@ FUNC_START(thirty_two_instruction_loop) subi r3,r3,1 b FUNC_NAME(thirty_two_instruction_loop) FUNC_END(thirty_two_instruction_loop) + +FUNC_START(thirty_two_instruction_loop_with_ll_sc) + cmpdi r3,0 + beqlr + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 5 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 +1: ldarx r6,0,r4 # 10 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 15 + addi r5,r5,1 + addi r5,r5,1 + stdcx. r6,0,r4 + bne- 1b + addi r5,r5,1 # 20 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 25 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 + addi r5,r5,1 # 30 + subi r3,r3,1 + b FUNC_NAME(thirty_two_instruction_loop_with_ll_sc) +FUNC_END(thirty_two_instruction_loop_with_ll_sc) diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index 932a032bf036..d6ae54663aed 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso +TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm diff --git a/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c new file mode 100644 index 000000000000..e3972264615b --- /dev/null +++ b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test that a syscall does not get restarted twice, handled by trap_norestart() + * + * Based on Al's description, and a test for the bug fixed in this commit: + * + * commit 9a81c16b527528ad307843be5571111aa8d35a80 + * Author: Al Viro <viro@zeniv.linux.org.uk> + * Date: Mon Sep 20 21:48:57 2010 +0100 + * + * powerpc: fix double syscall restarts + * + * Make sigreturn zero regs->trap, make do_signal() do the same on all + * paths. As it is, signal interrupting e.g. read() from fd 512 (== + * ERESTARTSYS) with another signal getting unblocked when the first + * handler finishes will lead to restart one insn earlier than it ought + * to. Same for multiple signals with in-kernel handlers interrupting + * that sucker at the same time. Same for multiple signals of any kind + * interrupting that sucker on 64bit... 
+ */ +#define _GNU_SOURCE +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/syscall.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include "utils.h" + +static void SIGUSR1_handler(int sig) +{ + kill(getpid(), SIGUSR2); + /* + * SIGUSR2 is blocked until the handler exits, at which point it will + * be raised again and think there is a restart to be done because the + * pending restarted syscall has 512 (ERESTARTSYS) in r3. The second + * restart will retreat NIP another 4 bytes to fail case branch. + */ +} + +static void SIGUSR2_handler(int sig) +{ +} + +static ssize_t raw_read(int fd, void *buf, size_t count) +{ + register long nr asm("r0") = __NR_read; + register long _fd asm("r3") = fd; + register void *_buf asm("r4") = buf; + register size_t _count asm("r5") = count; + + asm volatile( +" b 0f \n" +" b 1f \n" +" 0: sc 0 \n" +" bns 2f \n" +" neg %0,%0 \n" +" b 2f \n" +" 1: \n" +" li %0,%4 \n" +" 2: \n" + : "+r"(_fd), "+r"(nr), "+r"(_buf), "+r"(_count) + : "i"(-ENOANO) + : "memory", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "ctr", "cr0"); + + if (_fd < 0) { + errno = -_fd; + _fd = -1; + } + + return _fd; +} + +#define DATA "test 123" +#define DLEN (strlen(DATA)+1) + +int test_restart(void) +{ + int pipefd[2]; + pid_t pid; + char buf[512]; + + if (pipe(pipefd) == -1) { + perror("pipe"); + exit(EXIT_FAILURE); + } + + pid = fork(); + if (pid == -1) { + perror("fork"); + exit(EXIT_FAILURE); + } + + if (pid == 0) { /* Child reads from pipe */ + struct sigaction act; + int fd; + + memset(&act, 0, sizeof(act)); + sigaddset(&act.sa_mask, SIGUSR2); + act.sa_handler = SIGUSR1_handler; + act.sa_flags = SA_RESTART; + if (sigaction(SIGUSR1, &act, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + memset(&act, 0, sizeof(act)); + act.sa_handler = SIGUSR2_handler; + act.sa_flags = SA_RESTART; + if (sigaction(SIGUSR2, &act, NULL) == -1) { + perror("sigaction"); + exit(EXIT_FAILURE); + } + + /* Let's get ERESTARTSYS into r3 */ + while ((fd = dup(pipefd[0])) != 512) { + if (fd == -1) { + perror("dup"); + exit(EXIT_FAILURE); + } + } + + if (raw_read(fd, buf, 512) == -1) { + if (errno == ENOANO) { + fprintf(stderr, "Double restart moved restart before sc instruction.\n"); + _exit(EXIT_FAILURE); + } + perror("read"); + exit(EXIT_FAILURE); + } + + if (strncmp(buf, DATA, DLEN)) { + fprintf(stderr, "bad test string %s\n", buf); + exit(EXIT_FAILURE); + } + + return 0; + + } else { + int wstatus; + + usleep(100000); /* Hack to get reader waiting */ + kill(pid, SIGUSR1); + usleep(100000); + if (write(pipefd[1], DATA, DLEN) != DLEN) { + perror("write"); + exit(EXIT_FAILURE); + } + close(pipefd[0]); + close(pipefd[1]); + if (wait(&wstatus) == -1) { + perror("wait"); + exit(EXIT_FAILURE); + } + if (!WIFEXITED(wstatus)) { + fprintf(stderr, "child exited abnormally\n"); + exit(EXIT_FAILURE); + } + + FAIL_IF(WEXITSTATUS(wstatus) != EXIT_SUCCESS); + + return 0; + } +} + +int main(void) +{ + test_harness_set_timeout(10); + return test_harness(test_restart, "sig sys restart"); +} diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 4bca5a9327a4..bed4b5318a86 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -2,7 +2,9 @@ /fd-001-lookup /fd-002-posix-eq /fd-003-kthread +/proc-fsconfig-hidepid /proc-loadavg-001 +/proc-multiple-procfs /proc-pid-vm /proc-self-map-files-001 /proc-self-map-files-002 diff --git 
a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index a8ed0f684829..8be8a03d2973 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -19,5 +19,7 @@ TEST_GEN_PROGS += self TEST_GEN_PROGS += setns-dcache TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self +TEST_GEN_PROGS += proc-multiple-procfs +TEST_GEN_PROGS += proc-fsconfig-hidepid include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-fsconfig-hidepid.c b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c new file mode 100644 index 000000000000..b9af8f537185 --- /dev/null +++ b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c @@ -0,0 +1,50 @@ +/* + * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include <assert.h> +#include <unistd.h> +#include <stdlib.h> +#include <errno.h> +#include <linux/mount.h> +#include <linux/unistd.h> + +static inline int fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} + +static inline int fsconfig(int fd, unsigned int cmd, const char *key, const void *val, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, val, aux); +} + +int main(void) +{ + int fsfd, ret; + int hidepid = 2; + + assert((fsfd = fsopen("proc", 0)) != -1); + + ret = fsconfig(fsfd, FSCONFIG_SET_BINARY, "hidepid", &hidepid, 0); + assert(ret == -1); + assert(errno == EINVAL); + + assert(!fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "2", 0)); + assert(!fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "invisible", 0)); + + assert(!close(fsfd)); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-multiple-procfs.c b/tools/testing/selftests/proc/proc-multiple-procfs.c new file mode 100644 index 000000000000..ab912ad95dab --- /dev/null +++ b/tools/testing/selftests/proc/proc-multiple-procfs.c @@ -0,0 +1,48 @@ +/* + * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/mount.h> +#include <sys/types.h> +#include <sys/stat.h> + +int main(void) +{ + struct stat proc_st1, proc_st2; + char procbuff[] = "/tmp/proc.XXXXXX/meminfo"; + char procdir1[] = "/tmp/proc.XXXXXX"; + char procdir2[] = "/tmp/proc.XXXXXX"; + + assert(mkdtemp(procdir1) != NULL); + assert(mkdtemp(procdir2) != NULL); + + assert(!mount("proc", procdir1, "proc", 0, "hidepid=1")); + assert(!mount("proc", procdir2, "proc", 0, "hidepid=2")); + + snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir1); + assert(!stat(procbuff, &proc_st1)); + + snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir2); + assert(!stat(procbuff, &proc_st2)); + + umount(procdir1); + umount(procdir2); + + assert(proc_st1.st_dev != proc_st2.st_dev); + + return 0; +} diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 4f1831e62ea5..849e8226395a 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only hugepage-mmap hugepage-shm +khugepaged map_hugetlb map_populate thuge-gen @@ -9,6 +10,7 @@ mlock2-tests mremap_dontunmap on-fault-limit transhuge-stress +protection_keys userfaultfd mlock-intersect-test mlock-random-test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index c6eb5305a0f6..a9026706d597 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -20,6 +20,31 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_FILES += khugepaged + +ifeq ($(ARCH),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_program.c -no-pie) + +TARGETS := protection_keys +BINARIES_32 := $(TARGETS:%=%_32) +BINARIES_64 := $(TARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else +TEST_GEN_FILES += protection_keys +endif ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch @@ -36,6 +61,55 @@ include ../lib.mk $(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread +ifeq ($(ARCH),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): %_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): %_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq 
($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + $(OUTPUT)/userfaultfd: LDLIBS += -lpthread $(OUTPUT)/mlock-random-test: LDLIBS += -lcap diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c new file mode 100644 index 000000000000..51b89cedd09d --- /dev/null +++ b/tools/testing/selftests/vm/khugepaged.c @@ -0,0 +1,1035 @@ +#define _GNU_SOURCE +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> + +#include <sys/mman.h> +#include <sys/wait.h> + +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define PID_SMAPS "/proc/self/smaps" + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +static const char *thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +static const char *thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +static const char *shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled shmem_enabled; + bool debug_cow; + bool use_zero_page; + struct khugepaged_settings khugepaged; +}; + +static struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_NEVER, + .debug_cow = 0, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, +}; + +static struct settings saved_settings; +static bool skip_settings_restore; + +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = 
open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static int read_string(const char *name, const char *strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +static void write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static const unsigned long read_num(const char *name) +{ + char path[PATH_MAX]; + char buf[21]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + ret = read_file(path, buf, sizeof(buf)); + if (ret < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + char buf[21]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static void write_settings(struct settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + write_string("enabled", thp_enabled_strings[settings->thp_enabled]); + write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + write_num("debug_cow", settings->debug_cow); + write_num("use_zero_page", settings->use_zero_page); + + write_num("khugepaged/defrag", khugepaged->defrag); + write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); +} + +static void restore_settings(int sig) +{ + if (skip_settings_restore) + goto out; + + printf("Restore THP and khugepaged settings..."); + write_settings(&saved_settings); + success("OK"); + if (sig) + exit(EXIT_FAILURE); +out: + exit(exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + saved_settings = (struct settings) { + .thp_enabled = 
read_string("enabled", thp_enabled_strings), + .thp_defrag = read_string("defrag", thp_defrag_strings), + .shmem_enabled = + read_string("shmem_enabled", shmem_enabled_strings), + .debug_cow = read_num("debug_cow"), + .use_zero_page = read_num("use_zero_page"), + }; + saved_settings.khugepaged = (struct khugepaged_settings) { + .defrag = read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = read_num("khugepaged/pages_to_scan"), + }; + success("OK"); + + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void adjust_settings(void) +{ + + printf("Adjust settings..."); + write_settings(&default_settings); + success("OK"); +} + +#define MAX_LINE_LENGTH 500 + +static bool check_for_pattern(FILE *fp, char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +static bool check_huge(void *addr) +{ + bool thp = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", + hpage_pmd_size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the AnonHugePages: in the same block and check whether it got + * the expected number of hugeepages next. + */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + thp = true; +err_out: + fclose(fp); + return thp; +} + + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected number of hugeepages next. 
+ */ + if (!check_for_pattern(fp, "Swap:", buffer)) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(void) +{ + void *p; + + p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (check_huge(p)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan completed */ + full_scans = read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (check_huge(p)) + break; + if (read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + return !timeout; +} + +static void alloc_at_fault(void) +{ + struct settings settings = default_settings; + char *p; + + settings.thp_enabled = THP_ALWAYS; + write_settings(&settings); + + p = alloc_mapping(); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + write_settings(&default_settings); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + if (wait_for_scan("Collapse fully populated PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_empty(void) +{ + void *p; + + p = alloc_mapping(); + if (wait_for_scan("Do not collapse empty PTE table", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(void) +{ + void *p; + + p = alloc_mapping(); + fill_memory(p, 0, page_size); + if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(void) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct settings settings = default_settings; + void *p; + + settings.khugepaged.max_ptes_none = max_ptes_none; + write_settings(&settings); + + p = alloc_mapping(); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + 
else + success("OK"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + + munmap(p, hpage_pmd_size); + write_settings(&default_settings); +} + +static void collapse_swapin_single_pte(void) +{ + void *p; + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with swapping in single PTE entry", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(void) +{ + int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + void *p; + + p = alloc_mapping(); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) + fail("Timeout"); + else if (check_huge(p)) + fail("Fail"); + else + success("OK"); + validate_memory(p, 0, hpage_pmd_size); + + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); +out: + munmap(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(void) +{ + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(void) +{ + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + 
success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(void) +{ + void *p; + int i; + + p = alloc_mapping(); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(BASE_ADDR, 0, hpage_pmd_size); + if (!check_huge(BASE_ADDR)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + munmap(BASE_ADDR, hpage_pmd_size); + fill_memory(p, 0, hpage_pmd_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Collapse PTE table full of different compound pages", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate small page..."); + fill_memory(p, 0, page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + fill_memory(p, page_size, 2 * page_size); + + if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_fork_compound(void) +{ + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + fill_memory(p, 0, page_size); + + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) + 
fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + write_num("khugepaged/max_ptes_shared", + default_settings.khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared() +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) + fail("Timeout"); + else if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + + if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +int main(void) +{ + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_nr = hpage_pmd_size / page_size; + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + + save_settings(); + adjust_settings(); + + alloc_at_fault(); + collapse_full(); + collapse_empty(); + collapse_single_pte_entry(); + collapse_max_ptes_none(); + collapse_swapin_single_pte(); + collapse_max_ptes_swap(); + collapse_single_pte_entry_compound(); + collapse_full_of_compound(); + collapse_compound_extreme(); + collapse_fork(); + collapse_fork_compound(); + collapse_max_ptes_shared(); + + restore_settings(0); +} diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c index ee06cb0b9efb..3a7b5ef0b0c6 100644 --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ b/tools/testing/selftests/vm/mremap_dontunmap.c @@ -11,7 +11,6 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stdlib.h> #include <unistd.h> #include "../kselftest.h" diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h new 
file mode 100644 index 000000000000..622a85848f61 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-helpers.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include <string.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <ucontext.h> +#include <sys/mman.h> + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) +{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 
pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy. since something could + * change PKEY register between the read and the write. + */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); + write_pkey_reg(pkey_reg); +} + +static inline void __pkey_write_allow(int pkey, int do_allow_write) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2 + 1; + + if (do_allow_write) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + write_pkey_reg(pkey_reg); + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); +} + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) \ + ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) \ + ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return &si->si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h new file mode 100644 index 000000000000..1ebb586b2fbc --- /dev/null +++ b/tools/testing/selftests/vm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; 
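+	/* pkey 0 occupies the highest-order bits of the AMR, hence the reversed index */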
+} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. + */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h new file mode 100644 index 000000000000..3be20f5d5275 --- /dev/null +++ b/tools/testing/selftests/vm/pkey-x86.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 
0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + eax = 0x7; + ecx = 0x0; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + eax = XSTATE_CPUID; + ecx = leaf; + __cpuid(&eax, &ebx, &ecx, &edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c index 480995bceefa..fc19addcb5c8 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/vm/protection_keys.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * Tests Memory Protection Keys (see Documentation/vm/protection-keys.txt) * * There are examples in here of: * * how to set protection keys on memory - * * how to set/clear bits in PKRU (the rights register) - * * how to handle SEGV_PKRU signals and extract pkey-relevant + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant * information from the siginfo * * Things to add: @@ -22,8 +22,10 @@ * gcc -m32 -o 
protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm */ #define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ #include <errno.h> #include <linux/futex.h> +#include <time.h> #include <sys/time.h> #include <sys/syscall.h> #include <string.h> @@ -48,34 +50,10 @@ int iteration_nr = 1; int test_nr; -unsigned int shadow_pkru; - -#define HPAGE_SIZE (1UL<<21) -#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - +u64 shadow_pkey_reg; int dprint_in_signal; char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - void cat_into_file(char *str, char *file) { int fd = open(file, O_RDWR); @@ -158,12 +136,6 @@ void abort_hooks(void) #endif } -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - /* * This attempts to have roughly a page of instructions followed by a few * instructions that do a write, and another page of instructions. That @@ -174,7 +146,12 @@ static inline void __page_o_noops(void) * will then fault, which makes sure that the fault code handles * execute-only memory properly. 
*/ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else __attribute__((__aligned__(PAGE_SIZE))) +#endif void lots_o_noops_around_write(int *write_to_me) { dprintf3("running %s()\n", __func__); @@ -186,51 +163,134 @@ void lots_o_noops_around_write(int *write_to_me) dprintf3("%s() done\n", __func__); } -/* Define some kernel-like types */ -#define u8 uint8_t -#define u16 uint16_t -#define u32 uint32_t -#define u64 uint64_t +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; -#ifdef __i386__ + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 + return (u32) get_pkey_bits(pkey_reg, pkey); +} -#else +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 + __write_pkey_reg(new_pkey_reg); -#endif + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} -void dump_mem(void *dumpme, int len_bytes) +void pkey_disable_set(int pkey, int flags) { - char *c = (void *)dumpme; - int i; + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr); - } + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = 
hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); } /* Failed address bound checks: */ @@ -255,7 +315,7 @@ static char *si_code_str(int si_code) return "UNKNOWN"; } -int pkru_faults; +int pkey_faults; int last_si_pkey = -1; void signal_handler(int signum, siginfo_t *si, void *vucontext) { @@ -263,24 +323,28 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) int trapno; unsigned long ip; char *fpregs; - u32 *pkru_ptr; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ u64 siginfo_pkey; u32 *si_pkey_ptr; - int pkru_offset; - fpregset_t fpregset; dprint_in_signal = 1; dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__, - __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregset = uctxt->uc_mcontext.fpregs; - fpregs = (void *)fpregset; + fpregs = (char *) uctxt->uc_mcontext.fpregs; - dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__, - trapno, ip, si_code_str(si->si_code), si->si_code); + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ #ifdef __i386__ /* * 32-bit has some extra padding so that userspace can tell whether @@ -288,20 +352,22 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) * state. We just assume that it is here. */ fpregs += 0x70; -#endif - pkru_offset = pkru_xstate_offset(); - pkru_ptr = (void *)(&fpregs[pkru_offset]); +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); /* - * If we got a PKRU fault, we *HAVE* to have at least one bit set in + * If we got a PKEY fault, we *HAVE* to have at least one bit set in * here. 
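+	 * ("Here" means the pkey register value saved in the signal frame's xsave area.)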
*/ - dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset()); + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); if (DEBUG_LEVEL > 4) - dump_mem(pkru_ptr - 128, 256); - pkey_assert(*pkru_ptr); + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); if ((si->si_code == SEGV_MAPERR) || (si->si_code == SEGV_ACCERR) || @@ -310,20 +376,29 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext) exit(4); } - si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset); + si_pkey_ptr = siginfo_get_pkey_ptr(si); dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); dump_mem((u8 *)si_pkey_ptr - 8, 24); siginfo_pkey = *si_pkey_ptr; pkey_assert(siginfo_pkey < NR_PKEYS); last_si_pkey = siginfo_pkey; - dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr); - /* need __rdpkru() version so we do not do shadow_pkru checking */ - dprintf1("signal pkru from pkru: %08x\n", __rdpkru()); - dprintf1("pkey from siginfo: %jx\n", siginfo_pkey); - *(u64 *)pkru_ptr = 0x00000000; - dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n"); - pkru_faults++; + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; dprintf1("<<<<==================================================\n"); dprint_in_signal = 0; } @@ -391,143 +466,6 @@ pid_t fork_lazy_child(void) return forkret; } -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 pkru = __rdpkru(); - u32 shifted_pkru; - u32 masked_pkru; - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkru: %x\n", __func__, pkru); - - shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY)); - dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru); - masked_pkru = shifted_pkru & mask; - dprintf2("%s() masked pkru: %x\n", __func__, masked_pkru); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other high bits. 
- */ - return masked_pkru; -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u32 old_pkru = __rdpkru(); - u32 new_pkru; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* copy old pkru */ - new_pkru = old_pkru; - /* mask out bits from pkey in old value: */ - new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY)); - /* OR in new bits for pkey: */ - new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY)); - - __wrpkru(new_pkru); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n", - __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u32 orig_pkru = rdpkru(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /*pkru and flags have the same format */ - shadow_pkru |= flags << (pkey * 2); - dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - pkey_assert(rdpkru() > orig_pkru); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u32 orig_pkru = rdpkru(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - /* pkru and flags have the same format */ - shadow_pkru &= ~(flags << (pkey * 2)); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru()); - if (flags) - assert(rdpkru() > orig_pkru); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, unsigned long pkey) { @@ -561,33 +499,44 @@ int alloc_pkey(void) int ret; unsigned long init_val = 0x0; - dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n", - __LINE__, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); ret = sys_pkey_alloc(0, init_val); /* - * pkey_alloc() sets PKRU, so we need to reflect it in - * shadow_pkru: + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: */ - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - 
__LINE__, ret, __rdpkru(), shadow_pkru); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); if (ret) { /* clear both the bits: */ - shadow_pkru &= ~(0x3 << (ret * 2)); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); /* * move the new state in from init_val - * (remember, we cheated and init_val == pkru format) + * (remember, we cheated and init_val == pkey_reg format) */ - shadow_pkru |= (init_val << (ret * 2)); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); } - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); - dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); /* for shadow checking: */ - rdpkru(); - dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", - __LINE__, ret, __rdpkru(), shadow_pkru); + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); return ret; } @@ -612,10 +561,10 @@ int alloc_random_pkey(void) int nr_alloced = 0; int random_index; memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + srand((unsigned int)time(NULL)); /* allocate every possible key and make a note of which ones we got */ max_nr_pkey_allocs = NR_PKEYS; - max_nr_pkey_allocs = 1; for (i = 0; i < max_nr_pkey_allocs; i++) { int new_pkey = alloc_pkey(); if (new_pkey < 0) @@ -638,8 +587,9 @@ int alloc_random_pkey(void) free_ret = sys_pkey_free(alloced_pkeys[i]); pkey_assert(!free_ret); } - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -657,11 +607,15 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, if (nr_iterations-- < 0) break; - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); } pkey_assert(pkey < NR_PKEYS); @@ -669,8 +623,9 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, ret, __rdpkru(), shadow_pkru); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); return ret; } @@ -752,7 +707,7 @@ void *malloc_pkey_with_mprotect(long size, int 
prot, u16 pkey) void *ptr; int ret; - rdpkru(); + read_pkey_reg(); dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, size, prot, pkey); pkey_assert(pkey < NR_PKEYS); @@ -761,7 +716,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); pkey_assert(!ret); record_pkey_malloc(ptr, size, prot); - rdpkru(); + read_pkey_reg(); dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); return ptr; @@ -798,12 +753,15 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) } int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" #define GET_NR_HUGE_PAGES 10 void setup_hugetlbfs(void) { int err; int fd; - char buf[] = "123"; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; if (geteuid() != 0) { fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); @@ -814,11 +772,16 @@ void setup_hugetlbfs(void) /* * Now go make sure that we got the pages and that they - * are 2M pages. Someone might have made 1G the default. + * are PMD-level pages. Someone might have made PUD-level + * pages the default. */ - fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY); + hpagesz_kb = HPAGE_SIZE / 1024; + hpagesz_mb = hpagesz_kb / 1024; + sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); + fd = open(buf, O_RDONLY); if (fd < 0) { - perror("opening sysfs 2M hugetlb config"); + fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } @@ -826,13 +789,14 @@ void setup_hugetlbfs(void) err = read(fd, buf, sizeof(buf)-1); close(fd); if (err <= 0) { - perror("reading sysfs 2M hugetlb config"); + fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", + hpagesz_mb, strerror(errno)); return; } if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n", - buf, GET_NR_HUGE_PAGES); + fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", + hpagesz_mb, buf, GET_NR_HUGE_PAGES); return; } @@ -886,6 +850,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { malloc_pkey_with_mprotect, + malloc_pkey_with_mprotect_subpage, malloc_pkey_anon_huge, malloc_pkey_hugetlb /* can not do direct with the pkey_mprotect() API: @@ -924,14 +889,14 @@ void *malloc_pkey(long size, int prot, u16 pkey) return ret; } -int last_pkru_faults; +int last_pkey_faults; #define UNKNOWN_PKEY -2 -void expected_pk_fault(int pkey) +void expected_pkey_fault(int pkey) { - dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n", - __func__, last_pkru_faults, pkru_faults); + dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", + __func__, last_pkey_faults, pkey_faults); dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkru_faults + 1 == pkru_faults); + pkey_assert(last_pkey_faults + 1 == pkey_faults); /* * For exec-only memory, we do not know the pkey in @@ -940,24 +905,28 @@ void expected_pk_fault(int pkey) if (pkey != UNKNOWN_PKEY) pkey_assert(last_si_pkey == pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ /* - * The signal handler shold have cleared out PKRU to let the + * The signal handler shold have cleared out PKEY register to let the * test program continue. We now have to restore it. 
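+	 * (The powerpc handler re-allows access instead, so the #else branch below checks the register against the shadow copy.)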
*/ - if (__rdpkru() != 0) + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ pkey_assert(0); - __wrpkru(shadow_pkru); - dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n", - __func__, shadow_pkru); - last_pkru_faults = pkru_faults; + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; last_si_pkey = -1; } -#define do_not_expect_pk_fault(msg) do { \ - if (last_pkru_faults != pkru_faults) \ - dprintf0("unexpected PK fault: %s\n", msg); \ - pkey_assert(last_pkru_faults == pkru_faults); \ +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ } while (0) int test_fds[10] = { -1 }; @@ -1000,6 +969,58 @@ __attribute__((noinline)) int read_ptr(int *ptr) return *ptr; } +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + void test_read_of_write_disabled_region(int *ptr, u16 pkey) { int ptr_contents; @@ -1015,26 +1036,67 @@ void test_read_of_access_disabled_region(int *ptr, u16 pkey) int ptr_contents; dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - rdpkru(); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); pkey_access_deny(pkey); ptr_contents = read_ptr(ptr); dprintf1("*ptr: %d\n", ptr_contents); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing 
write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_write_of_write_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); pkey_write_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } void test_write_of_access_disabled_region(int *ptr, u16 pkey) { dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); pkey_access_deny(pkey); *ptr = __LINE__; - expected_pk_fault(pkey); + expected_pkey_fault(pkey); } + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) { int ret; @@ -1160,9 +1222,11 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) int new_pkey; dprintf1("%s() alloc loop: %d\n", __func__, i); new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__, - __LINE__, err, __rdpkru(), shadow_pkru); - rdpkru(); /* for shadow checking */ + dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, err, __read_pkey_reg(), + shadow_pkey_reg); + read_pkey_reg(); /* for shadow checking */ dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); if ((new_pkey == -1) && (errno == ENOSPC)) { dprintf2("%s() failed to allocate pkey after %d tries\n", @@ -1188,6 +1252,7 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) dprintf3("%s()::%d\n", __func__, __LINE__); /* + * On x86: * There are 16 pkeys supported in hardware. Three are * allocated by the time we get here: * 1. The default key (0) @@ -1195,13 +1260,21 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey) * 3. One allocated by the test code and passed in via * 'pkey' to this function. * Ensure that we can allocate at least another 13 (16-3). + * + * On powerpc: + * There are either 5, 28, 29 or 32 pkeys supported in + * hardware depending on the page size (4K or 64K) and + * platform (powernv or powervm). Four are allocated by + * the time we get here. These include pkey-0, pkey-1, + * exec-only pkey and the one allocated by the test code. + * Ensure that we can allocate the remaining. 
*/ - pkey_assert(i >= NR_PKEYS-3); + pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); for (i = 0; i < nr_allocated_pkeys; i++) { err = sys_pkey_free(allocated_pkeys[i]); pkey_assert(!err); - rdpkru(); /* for shadow checking */ + read_pkey_reg(); /* for shadow checking */ } } @@ -1287,7 +1360,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect an exception: */ peek_result = read_ptr(ptr); - expected_pk_fault(pkey); + expected_pkey_fault(pkey); /* * Try to access the NON-pkey-protected "plain_ptr" via ptrace: @@ -1297,7 +1370,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey) pkey_assert(ret != -1); /* Now access from the current task, and expect NO exception: */ peek_result = read_ptr(plain_ptr); - do_not_expect_pk_fault("read plain pointer after ptrace"); + do_not_expect_pkey_fault("read plain pointer after ptrace"); ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); pkey_assert(ret != -1); @@ -1347,17 +1420,15 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey) pkey_assert(!ret); pkey_access_deny(pkey); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* * Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(pkey); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); } void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) @@ -1378,15 +1449,13 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); pkey_assert(!ret); - dprintf2("pkru: %x\n", rdpkru()); + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); /* Make sure this is an *instruction* fault */ madvise(p1, PAGE_SIZE, MADV_DONTNEED); lots_o_noops_around_write(&scratch); - do_not_expect_pk_fault("executing on PROT_EXEC memory"); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pk_fault(UNKNOWN_PKEY); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); /* * Put the memory back to non-PROT_EXEC. 
Should clear the @@ -1400,7 +1469,7 @@ void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); pkey_assert(!ret); ptr_contents = read_ptr(p1); - do_not_expect_pk_fault("plain read on recently PROT_EXEC area"); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); } void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) @@ -1408,7 +1477,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) int size = PAGE_SIZE; int sret; - if (cpu_has_pku()) { + if (cpu_has_pkeys()) { dprintf1("SKIP: %s: no CPU support\n", __func__); return; } @@ -1420,8 +1489,11 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) void (*pkey_tests[])(int *ptr, u16 pkey) = { test_read_of_write_disabled_region, test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, test_kernel_write_of_access_disabled_region, test_kernel_write_of_write_disabled_region, test_kernel_gup_of_access_disabled_region, @@ -1433,6 +1505,7 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = { test_pkey_syscalls_on_non_allocated_pkey, test_pkey_syscalls_bad_args, test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, }; void run_tests_once(void) @@ -1442,7 +1515,7 @@ void run_tests_once(void) for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { int pkey; - int orig_pkru_faults = pkru_faults; + int orig_pkey_faults = pkey_faults; dprintf1("======================\n"); dprintf1("test %d preparing...\n", test_nr); @@ -1457,8 +1530,8 @@ void run_tests_once(void) free_pkey_malloc(ptr); sys_pkey_free(pkey); - dprintf1("pkru_faults: %d\n", pkru_faults); - dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults); + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); tracing_off(); close_test_fds(); @@ -1471,18 +1544,19 @@ void run_tests_once(void) void pkey_setup_shadow(void) { - shadow_pkru = __rdpkru(); + shadow_pkey_reg = __read_pkey_reg(); } int main(void) { int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); setup_handlers(); - printf("has pku: %d\n", cpu_has_pku()); + printf("has pkeys: %d\n", pkeys_supported); - if (!cpu_has_pku()) { + if (!pkeys_supported) { int size = PAGE_SIZE; int *ptr; @@ -1495,7 +1569,7 @@ int main(void) } pkey_setup_shadow(); - printf("startup pkru: %x\n", rdpkru()); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); setup_hugetlbfs(); while (nr_iterations-- > 0) diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore index 022a1f3b64ef..1aaef5bf119a 100644 --- a/tools/testing/selftests/x86/.gitignore +++ b/tools/testing/selftests/x86/.gitignore @@ -12,5 +12,4 @@ ldt_gdt iopl mpx-mini-test ioperm -protection_keys test_vdso diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 5d49bfec1e9a..5f16821c7f63 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -12,7 +12,7 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie) TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \ check_initial_reg_state sigreturn iopl ioperm \ - protection_keys test_vdso test_vsyscall mov_ss_trap \ + test_vdso test_vsyscall mov_ss_trap \ 
syscall_arg_fault TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/pkey-helpers.h b/tools/testing/selftests/x86/pkey-helpers.h deleted file mode 100644 index 254e5436bdd9..000000000000 --- a/tools/testing/selftests/x86/pkey-helpers.h +++ /dev/null @@ -1,219 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include <string.h> -#include <stdarg.h> -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <signal.h> -#include <assert.h> -#include <stdlib.h> -#include <ucontext.h> -#include <sys/mman.h> - -#define NR_PKEYS 16 -#define PKRU_BITS_PER_PKEY 2 - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; -static inline void sigsafe_printf(const char *format, ...) -{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) dprintf_level(4, args) - -extern unsigned int shadow_pkru; -static inline unsigned int __rdpkru(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned int pkru; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkru = eax; - return pkru; -} - -static inline unsigned int _rdpkru(int line) -{ - unsigned int pkru = __rdpkru(); - - dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n", - line, pkru, shadow_pkru); - assert(pkru == shadow_pkru); - - return pkru; -} - -#define rdpkru() _rdpkru(__LINE__) - -static inline void __wrpkru(unsigned int pkru) -{ - unsigned int eax = pkru; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkru == __rdpkru()); -} - -static inline void wrpkru(unsigned int pkru) -{ - dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru); - /* will do the shadow check for us: */ - rdpkru(); - __wrpkru(pkru); - shadow_pkru = pkru; - dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru()); -} - -/* - * These are technically racy. since something could - * change PKRU between the read and the write. 
- */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - unsigned int pkru = rdpkru(); - int bit = pkey * 2; - - if (do_allow) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - dprintf4("pkru now: %08x\n", rdpkru()); - wrpkru(pkru); -} - -static inline void __pkey_write_allow(int pkey, int do_allow_write) -{ - long pkru = rdpkru(); - int bit = pkey * 2 + 1; - - if (do_allow_write) - pkru &= (1<<bit); - else - pkru |= (1<<bit); - - wrpkru(pkru); - dprintf4("pkru now: %08x\n", rdpkru()); -} - -#define PROT_PKEY0 0x10 /* protection key value (bit 0) */ -#define PROT_PKEY1 0x20 /* protection key value (bit 1) */ -#define PROT_PKEY2 0x40 /* protection key value (bit 2) */ -#define PROT_PKEY3 0x80 /* protection key value (bit 3) */ - -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - /* ecx is often an input as well as an output. */ - asm volatile( - "cpuid;" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pku(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - eax = 0x7; - ecx = 0x0; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -#define XSTATE_PKRU_BIT (9) -#define XSTATE_PKRU 0x200 - -int pkru_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKRU is set in XCR0 */ - leaf = XSTATE_PKRU_BIT; - { - eax = XSTATE_CPUID; - ecx = leaf; - __cpuid(&eax, &ebx, &ecx, &edx); - - if (leaf == XSTATE_PKRU_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKRU in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -#endif /* _PKEYS_HELPER_H */ |
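Note on the generalized hugetlb setup: setup_hugetlbfs() above stops hard-coding the x86 2048kB sysfs directory and instead formats SYSFS_FMT_NR_HUGE_PAGES with the architecture's HPAGE_SIZE, so the right nr_hugepages file is found regardless of the default huge page size. The following is a minimal, standalone sketch of that path construction only; it assumes HPAGE_SIZE is supplied by a per-arch header and falls back to the x86 2M value purely for illustration.

/* Sketch: build the per-size nr_hugepages path the way setup_hugetlbfs() does. */
#include <stdio.h>

#define SYSFS_FMT_NR_HUGE_PAGES \
	"/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"

#ifndef HPAGE_SIZE
#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumption: x86 2M default */
#endif

int main(void)
{
	char path[256];
	long hpagesz_kb = HPAGE_SIZE / 1024;

	snprintf(path, sizeof(path), SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
	/* x86 default:   .../hugepages-2048kB/nr_hugepages
	 * powerpc 16M:   .../hugepages-16384kB/nr_hugepages */
	printf("%s\n", path);
	return 0;
}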
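Note on the exhaustion bound: the rewritten check in test_pkey_alloc_exhaust(), pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)), reduces to the old hard-coded NR_PKEYS-3 on x86 assuming get_arch_reserved_keys() reports two arch-reserved keys there (pkey 0 and the exec-only key): with the one key the caller already holds, the test still expects at least 16 - 2 - 1 = 13 further allocations. On powerpc the same expression adapts to 5, 28, 29 or 32 hardware keys (and presumably three reserved ones, per the comment's list of pkey-0, pkey-1 and exec-only) without any further special-casing.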
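Note on the register accessors: the hunks throughout this file rename rdpkru()/wrpkru() to read_pkey_reg()/write_pkey_reg() and print the value with %016llx, i.e. the register is handled as a 64-bit quantity so the same code can drive x86 PKRU and powerpc AMR. The deleted x86-only header above shows the original RDPKRU/WRPKRU encodings; a minimal sketch of how those same instructions can sit behind the generic names is given below. The pkey_reg_t typedef name and the 64-bit width are assumptions made for this illustration, chosen to match the new printf format.

/* Sketch only: an x86 backend for the generic pkey-register accessors
 * that the renamed helpers above expect. */
#include <stdint.h>

typedef uint64_t pkey_reg_t;	/* wide enough for x86 PKRU and powerpc AMR */

static inline pkey_reg_t __read_pkey_reg(void)
{
	unsigned int eax, edx;
	unsigned int ecx = 0;

	/* RDPKRU: returns PKRU in EAX, clears EDX; ECX must be zero */
	asm volatile(".byte 0x0f,0x01,0xee\n\t"
		     : "=a" (eax), "=d" (edx)
		     : "c" (ecx));
	return eax;
}

static inline void __write_pkey_reg(pkey_reg_t pkey_reg)
{
	unsigned int eax = pkey_reg;
	unsigned int ecx = 0;
	unsigned int edx = 0;

	/* WRPKRU: writes EAX into PKRU; ECX and EDX must be zero */
	asm volatile(".byte 0x0f,0x01,0xef\n\t"
		     : : "a" (eax), "c" (ecx), "d" (edx));
}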
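Note on shadow tracking: the test keeps a software copy of the register (shadow_pkey_reg) and cross-checks it on every access, which is why expected_pkey_fault() rewrites the register and resets last_pkey_faults after the handler has run. A sketch of the checking wrappers, following the pattern of the deleted _rdpkru()/wrpkru() shadow logic and assuming the accessors sketched above; plain assert() stands in for the test's own pkey_assert()/dprintf helpers.

/* Sketch of the shadow-tracking wrappers used around the raw accessors. */
#include <assert.h>

extern pkey_reg_t shadow_pkey_reg;	/* software copy maintained by the test */

static inline pkey_reg_t read_pkey_reg(void)
{
	pkey_reg_t pkey_reg = __read_pkey_reg();

	/* hardware and shadow must agree outside of signal handling */
	assert(pkey_reg == shadow_pkey_reg);
	return pkey_reg;
}

static inline void write_pkey_reg(pkey_reg_t pkey_reg)
{
	read_pkey_reg();		/* validate the shadow before changing it */
	__write_pkey_reg(pkey_reg);
	shadow_pkey_reg = pkey_reg;	/* keep the software copy in sync */
	assert(__read_pkey_reg() == pkey_reg);
}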
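Note for readers who want to reproduce the access-deny behaviour these tests assert on, outside the selftest harness: the same allocate / attach / restrict cycle is available through the glibc wrappers (glibc 2.27 or later, pkey-capable hardware and kernel). This is an independent illustration, not part of the patch.

/* Independent illustration: exercising protection keys via glibc wrappers. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int *ptr = mmap(NULL, page, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	int pkey = pkey_alloc(0, 0);

	if (ptr == MAP_FAILED || pkey < 0) {
		perror("mmap/pkey_alloc");
		return 1;
	}
	/* tag the page with the newly allocated key */
	if (pkey_mprotect(ptr, page, PROT_READ | PROT_WRITE, pkey)) {
		perror("pkey_mprotect");
		return 1;
	}
	*ptr = 42;				/* allowed: no restrictions yet */

	pkey_set(pkey, PKEY_DISABLE_WRITE);	/* reads still work, writes would fault */
	printf("read with writes disabled: %d\n", *ptr);

	pkey_set(pkey, 0);			/* lift the restriction again */
	*ptr = 43;

	pkey_free(pkey);
	munmap(ptr, page);
	return 0;
}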