Spring IOC容器之Bean管理

发表于 2022-01-20 分类于学习笔记， Spring

Spring IOC容器之Bean管理

1 基于xml方式

在Spring配置文件中，在bean标签里面添加对应的属性，就可以实现对象的创建。

1.1 创建对象

配置文件，

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xmlns:p="http://www.springframework.org/schema/p"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">

    <bean id="user" class="com.company.User"></bean>
</beans>

读取配置文件，工厂方法创建对象，

/**
 * Loads bean1.xml from the classpath, fetches the "user" bean and prints it.
 */
public class Test {
    public static void main(String[] args) {
        // try-with-resources closes the context on exit; without close() the
        // container is never shut down and bean destroy methods are not invoked.
        try (ClassPathXmlApplicationContext context = new ClassPathXmlApplicationContext("bean1.xml")) {
            User user = context.getBean("user", User.class);
            System.out.println(user);
        }
    }
}

1.2 注入属性

1.2.1 set方法

先要定义属性的set方法，

public void setName(String name) {
    this.name = name;
}

public void setAddress(String address) {
    this.address = address;
}

配置文件中在bean标签下使用property标签设置属性，

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">

    <bean id="user" class="com.company.User">
        <property name="name" value="Xiaohua"></property>
        <property name="address" value="Shanghai"></property>
    </bean>
</beans>

简化方式：p名字空间注入（不常用），

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xmlns:p="http://www.springframework.org/schema/p"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">

    <bean id="user" class="com.company.User" p:name="Xiaohua" p:address="Shanghai"></bean>
</beans>

使用null标签注入空值，

<property name="telephone">
    <null></null>
</property>

使用xml CDATA注入特殊字符，

<property name="favoriteBook">
    <value><![CDATA[《你好，明天！》]]></value>
</property>

1.2.2 有参构造函数

先要定义类的有参构造函数，

public User(String name, String address) {
    this.name = name;
    this.address = address;
}

配置文件中在bean标签下使用constructor-arg标签初始化对象，

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">

    <bean id="user" class="com.company.User">
        <constructor-arg name="name" value="Lihua"></constructor-arg>
        <constructor-arg name="address" value="Beijing"></constructor-arg>
    </bean>
</beans>

1.2.3 外部bean

通常一个类会包含对象属性，

public class UserService {
    private UserDao userDao;

    public void setUserDao(UserDao userDao) {
        this.userDao = userDao;
    }

    public void update() {
        System.out.println("UserService update......");
        userDao.update();
    }
}

例如上面的代码，UserService类中就包含了一个UserDao对象

如果需要初始化类内的对象属性，可以使用ref以外部bean的形式配置，

<bean id="userDao" class="com.company.dao.UserDaoImpl"></bean>
<bean id="userService" class="com.company.service.UserService">
    <property name="userDao" ref="userDao"></property>
</bean>

1.2.4 内部bean

也可以在property标签内部初始化对象属性，这种方式被称为内部bean，

<bean id="userService" class="com.company.service.UserService">
    <property name="userDao">
        <bean id="userDao" class="com.company.dao.UserDaoImpl"></bean>
    </property>
</bean>

1.2.5 级联赋值

<bean id="userService" class="com.company.service.UserService">
    <property name="userDao">
        <bean id="userDao" class="com.company.dao.UserDaoImpl"></bean>
    </property>
    <property name="userDao.name" value="Lihua"></property>
</bean>

需要提前定义好get和set方法，否则会报错
先在UserService对象中创建UserDao对象，然后调用get方法获得UserDao对象，最后调用setName方法为name属性赋值

1.2.6 集合属性

提前定义好属性的set方法，

public class Student {
    private String[] courses;
    private List<String> nicknames;
    private Map<String, String> contactMethod;
    private Set<String> friends;

    public void setCourses(String[] courses) {
        this.courses = courses;
    }

    public void setNicknames(List<String> nicknames) {
        this.nicknames = nicknames;
    }

    public void setContactMethod(Map<String, String> contactMethod) {
        this.contactMethod = contactMethod;
    }

    public void setFriends(Set<String> friends) {
        this.friends = friends;
    }

    public void show() {
        System.out.println(Arrays.toString(courses));
        System.out.println(nicknames);
        System.out.println(contactMethod);
        System.out.println(friends);
    }
}

1.2.6.1 数组

<property name="courses">
    <array>
        <value>Java程序设计</value>
        <value>汇编语言</value>
        <value>操作系统原理</value>
    </array>
</property>

1.2.6.2 List集合

<property name="nicknames">
    <list>
        <value>Xiaoming</value>
        <value>Mingming</value>
    </list>
</property>

1.2.6.3 Map集合

<property name="contactMethod">
    <map>
        <entry key="tele" value="123xxxxxxxx"></entry>
        <entry key="qq" value="234xxxxx"></entry>
    </map>
</property>

1.2.6.4 Set集合

<property name="friends">
    <set>
        <value>Lihua</value>
        <value>Xiaohong</value>
    </set>
</property>

1.2.6.5 在集合里面设置对象类型值

有时集合属性中存放的是对象，

public class BookShelf {
    private List<Book> books;

    public void setBooks(List<Book> books) {
        this.books = books;
    }

    public void show() {
        System.out.println(books);
    }
}

使用ref标签初始化集合属性内部存放的对象，

<bean id="book1" class="com.company.Book">
    <property name="name" value="面向对象程序设计"></property>
</bean>
<bean id="book2" class="com.company.Book">
    <property name="name" value="数据库原理"></property>
</bean>

<bean id="bookShelf" class="com.company.BookShelf">
    <property name="books">
        <list>
            <ref bean="book1"></ref>
            <ref bean="book2"></ref>
        </list>
    </property>
</bean>

1.2.6.6 把集合注入部分提取出来

修改xml文件头部的配置，

<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xmlns:util="http://www.springframework.org/schema/util"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
                        http://www.springframework.org/schema/util http://www.springframework.org/schema/util/spring-util.xsd">

使用util标签将集合注入部分提取出来，

<bean id="book1" class="com.company.Book">
    <property name="name" value="面向对象程序设计"></property>
</bean>
<bean id="book2" class="com.company.Book">
    <property name="name" value="数据库原理"></property>
</bean>
<bean id="book3" class="com.company.Book">
    <property name="name" value="计算机网络"></property>
</bean>

<util:list id="books">
    <ref bean="book1"></ref>
    <ref bean="book2"></ref>
    <ref bean="book3"></ref>
</util:list>

引用被抽取出的集合，

<bean id="bookShelf" class="com.company.BookShelf">
    <property name="books" ref="books"></property>
</bean>

1.3 工厂bean

对于工厂bean，配置文件中定义的bean类型可以和返回类型不一样
第一步：创建类，让这个类实现接口FactoryBean，作为工厂bean
第二步：实现接口声明的方法，在实现的方法中定义返回的对象类型，

public class FacBean implements FactoryBean<Book> {
    @Override
    public Book getObject() throws Exception {
        Book book = new Book();
        book.setName("C++ Primer");
        return book;
    }

    @Override
    public Class<?> getObjectType() {
        return Book.class;
    }

    @Override
    public boolean isSingleton() {
        return false;
    }
}

配置文件，

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd">
    <bean id="book" class="com.company.factorybean.FacBean"></bean>
</beans>

bean标签定义的类型为FacBean，但是实际getBean方法返回的对象类型是Book

1.4 bean的作用域

在Spring里面，默认情况下，bean是单实例对象
bean标签scope属性的取值，

	是否单例	对象创建的时机
singleton	是	Spring加载配置文件时
prototype	否	调用getBean方法时

1.5 bean的生命周期

1.5.1 过程

通过构造器创建bean实例
调用set方法为bean的属性设置值和对其他bean引用
将bean对象传递给bean后置处理器的postProcessBeforeInitialization方法
调用bean的初始化方法（需要配置初始化方法）
将bean对象传递给bean后置处理器的postProcessAfterInitialization方法
bean可以被使用（对象获取到了）
当容器关闭时，调用bean的销毁方法（需要配置销毁的方法）

1.5.2 配置bean的初始化方法

为类定义一个方法，作为bean的初始化方法，

public void initMethod() {
    System.out.println("step 4: call init method");
}

在bean标签中使用init-method属性配置bean的初始化方法，

<bean id="order" class="com.company.cycle.Order" init-method="initMethod" destroy-method="destroyMethod">
    <property name="name" value="laptop"></property>
</bean>

1.5.3 配置bean的销毁方法

为类定义一个方法，作为bean的销毁方法，

public void destroyMethod() {
    System.out.println("step 7: call destroy method");
}

在bean标签中使用destroy-method属性配置bean的销毁方法，

<bean id="order" class="com.company.cycle.Order" init-method="initMethod" destroy-method="destroyMethod">
    <property name="name" value="laptop"></property>
</bean>

destroyMethod在容器关闭的时候被调用，

((ClassPathXmlApplicationContext)context).close();

1.6 xml自动装配

假设存在Employee类和Department类，

public class Employee {
    private String name;
    private Department dept;

    public void setName(String name) {
        this.name = name;
    }

    public void setDept(Department dept) {
        this.dept = dept;
    }

    @Override
    public String toString() {
        return "Employee{" +
                "name='" + name + '\'' +
                ", dept=" + dept +
                '}';
    }
}

public class Department {
    private String name;

    public void setName(String name) {
        this.name = name;
    }

    @Override
    public String toString() {
        return "Department{" +
                "name='" + name + '\'' +
                '}';
    }
}

1.6.1 按名字

在bean标签中将autowire属性配置为byName，

<bean id="employee" class="com.company.Employee" autowire="byName">
    <property name="name" value="Lihua"></property>
</bean>
<bean id="dept" class="com.company.Department">
    <property name="name" value="Tech"></property>
</bean>

1.6.2 按类型

bean标签中将autowire属性配置为byType，

<bean id="employee" class="com.company.Employee" autowire="byType">
    <property name="name" value="Lihua"></property>
</bean>
<bean id="dept" class="com.company.Department">
    <property name="name" value="Tech"></property>
</bean>

1.7 外部属性文件

使用“user.properties”配置文件初始化User对象，

user.name=Lihua
user.address=Beijing
user.telephone=123xxxxxxxx
user.favoriteBook=Hello, world!

在xml配置文件头部添加context名字空间，

<?xml version="1.0" encoding="UTF-8"?>
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xmlns:context="http://www.springframework.org/schema/context"
       xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans.xsd
                        http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context.xsd">

引用外部配置文件注入属性，

<context:property-placeholder location="classpath:user.properties"></context:property-placeholder>

<bean id="user" class="com.company.User">
    <property name="name" value="${user.name}"></property>
    <property name="address" value="${user.address}"></property>
    <property name="telephone" value="${user.telephone}"></property>
    <property name="favoriteBook" value="${user.favoriteBook}"></property>
</bean>

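使用“${key}”的形式引用外部配置文件的内容，必须保证大括号内和配置文件中的key一致
注意：默认情况下，环境中的系统属性优先于属性文件中的值，而 user.name 恰好与 JVM 系统属性 user.name（当前操作系统用户名）同名，因此 ${user.name} 实际注入的可能是系统用户名；实际项目中建议给 key 加上不会冲突的前缀（例如 app.user.name）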

2 基于注解方式

2.1 Spring创建对象的注解

@Component，通用注解
@Service，通常用在Service层
@Controller，通常用在Web层
@Repository，通常用在DAO层

以上四个注解功能是一样的，都可以用来创建bean实例

2.2 创建对象

2.2.1 开启组件扫描

可以使用逗号隔开多个包，

<context:component-scan base-package="com.company.dao, com.company.service"></context:component-scan>

也可以将上层的包作为参数（底层的包都会被扫描），

<context:component-scan base-package="com.company"></context:component-scan>

2.2.2 添加注解

@Repository(value = "userDao")
public class UserDao {
    @Override
    public String toString() {
        return "UserDao{}";
    }
}

@Service(value = "userService")
public class UserService {
    UserDao userDao;

    public void setUserDao(UserDao userDao) {
        this.userDao = userDao;
    }

    @Override
    public String toString() {
        return "UserService{" +
                "userDao=" + userDao +
                '}';
    }
}

注解括号内的value相当于xml配置文件bean标签的id属性
注解的value值可以不指定，默认值是将类名首字母小写

2.3 组件扫描配置

2.3.1 只扫描指定的注解

<context:component-scan base-package="com.company" use-default-filters="false">
    <context:include-filter type="annotation" expression="org.springframework.stereotype.Service"/>
    <context:include-filter type="annotation" expression="org.springframework.stereotype.Repository"/>
</context:component-scan>

默认情况下，Spring将扫描base-package包下所有的类
如果配置use-default-filters为false，那么Spring将根据配置进行组件扫描
使用context:include-filter标签可以指定将被扫描的注解

2.3.2 不扫描指定的注解

<context:component-scan base-package="com.company">
    <context:exclude-filter type="annotation" expression="org.springframework.stereotype.Component"/>
    <context:exclude-filter type="annotation" expression="org.springframework.stereotype.Controller"/>
</context:component-scan>

使用context:exclude-filter标签可以指定某些注解不被扫描

2.4 注入属性

2.4.1 Autowired

@Autowired注解根据属性的类型自动装配，

@Repository
public class UserDaoImpl implements UserDao{
    @Override
    public void update() {
        System.out.println("UserDao update......");
    }
}

@Service(value = "userService")
public class UserService {
    @Autowired
    UserDao userDao;

    public void setUserDao(UserDao userDao) {
        this.userDao = userDao;
    }

    @Override
    public String toString() {
        return "UserService{" +
                "userDao=" + userDao +
                '}';
    }
}

在userDao属性上注解@Autowired，Spring将在组件扫描的包内搜寻符合条件的类型创建对象。在这个例子中，Spring找到UserDaoImpl是UserDao的实现类，符合条件

2.4.2 Qualifier

在上面的例子中，如果UserDao接口有多个实现类，仅通过@Autowired注解自动装配会产生冲突，因为Spring不知道该创建哪个实现类的对象
这个时候可以使用@Qualifier注解指定实现类，

@Repository
public class UserDaoImpl2 implements UserDao{
    @Override
    public void update() {
        System.out.println("UserDao update......");
    }
}

@Service(value = "userService")
public class UserService {
    @Autowired
    @Qualifier(value = "userDaoImpl2")
    UserDao userDao;

    public void setUserDao(UserDao userDao) {
        this.userDao = userDao;
    }

    @Override
    public String toString() {
        return "UserService{" +
                "userDao=" + userDao +
                '}';
    }
}

@Qualifier注解括号内的value必须和目标实现类的bean名称相同：即实现类注解括号内的value；如果像上例的@Repository那样没有指定value，则为类名首字母小写（这里是userDaoImpl2）

2.4.3 Value

使用@Value注解可以注入非对象类型的属性，

@Repository
public class UserDaoImpl1 implements UserDao{
    @Value(value = "Lihua")
    private String name;

    @Override
    public void update() {
        System.out.println("UserDao update......");
    }

    @Override
    public String toString() {
        return "UserDaoImpl1{" +
                "name='" + name + '\'' +
                '}';
    }
}

2.5 完全注解开发

2.5.1 配置类

创建配置类，替代xml配置文件，

/**
 * Java-based replacement for the XML configuration file.
 *
 * Marked with @Configuration (org.springframework.context.annotation), the
 * annotation that declares a configuration class. @Configurable is a different
 * annotation (org.springframework.beans.factory.annotation) meant for
 * AspectJ-driven injection into objects created outside the container.
 * @ComponentScan scans com.company and its sub-packages for annotated components.
 */
@Configuration
@ComponentScan(basePackages = "com.company")
public class SpringConfig {}

2.5.2 加载配置类，创建对象

ApplicationContext context = new AnnotationConfigApplicationContext(SpringConfig.class);
UserService service = context.getBean("userService", UserService.class);
System.out.println(service);

只需要将context的实现类替换成AnnotationConfigApplicationContext，将配置类的类型作为参数即可

MapperX 性能测试

发表于 2021-12-06 更新于 2021-12-13 分类于项目实战， MapperX

MapperX 性能测试

1 搭建 dm-cache 环境

1.1 创建物理卷

1.1.1 修改虚拟机设置，添加两块虚拟磁盘

1.1.2 对新建的磁盘进行分区及格式化的工作（以其中一块为例）

执行 fdisk /dev/sdb，

$ fdisk /dev/sdb
Welcome to fdisk (util-linux 2.23.2).

Changes will remain in memory only, until you decide to write them.
Be careful before using the write command.

Device does not contain a recognized partition table
Building a new DOS disklabel with disk identifier 0x6f3b5f16.

Command (m for help):

在上一步骤的基础上输入：m，

Command (m for help): m
Command action
   a   toggle a bootable flag
   b   edit bsd disklabel
   c   toggle the dos compatibility flag
   d   delete a partition
   g   create a new empty GPT partition table
   G   create an IRIX (SGI) partition table
   l   list known partition types
   m   print this menu
   n   add a new partition
   o   create a new empty DOS partition table
   p   print the partition table
   q   quit without saving changes
   s   create a new empty Sun disklabel
   t   change a partition's system id
   u   change display/entry units
   v   verify the partition table
   w   write table to disk and exit
   x   extra functionality (experts only)

然后根据提示输入：n，以添加新的分区，

Command (m for help): n
Partition type:
   p   primary (0 primary, 0 extended, 4 free)
   e   extended
Select (default p):

依次输入 p 和 1，

Partition type:
   p   primary (0 primary, 0 extended, 4 free)
   e   extended
Select (default p): p
Partition number (1-4, default 1): 1

接着便会提示卷的起始扇区和结束扇区，都保持默认值、直接按回车即可（意思是只分一个区），

First sector (2048-4194303, default 2048):
Using default value 2048
Last sector, +sectors or +size{K,M,G} (2048-4194303, default 4194303):
Using default value 4194303
Partition 1 of type Linux and of size 2 GiB is set

Command (m for help):

输入 “w” 保存并退出，

Command (m for help): w
The partition table has been altered!

Calling ioctl() to re-read partition table.
Syncing disks.

再次使用 “fdisk -l ”这个命令来查看会发现出现了/dev/sdb1（说明已经完成了分区工作）,

$ fdisk -l
............
   Device Boot      Start         End      Blocks   Id  System
/dev/sdb1            2048     4194303     2096128   83  Linux

Disk /dev/mapper/centos-root: 18.2 GB, 18249416704 bytes, 35643392 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
............

1.1.3 初始化分区为物理卷

使用 pvcreate 命令新建物理卷，

$ pvcreate /dev/sdb1
  Physical volume "/dev/sdb1" successfully created.

查看当前的物理卷，

$ pvdisplay
  --- Physical volume ---
  PV Name               /dev/sda2
  VG Name               centos
  PV Size               <19.00 GiB / not usable 3.00 MiB
  Allocatable           yes (but full)
  PE Size               4.00 MiB
  Total PE              4863
  Free PE               0
  Allocated PE          4863
  PV UUID               Y0aMSP-YKlD-Y2yH-c01s-4FME-4loY-GqqDVw

  "/dev/sdb1" is a new physical volume of "<2.00 GiB"
  --- NEW Physical volume ---
  PV Name               /dev/sdb1
  VG Name
  PV Size               <2.00 GiB
  Allocatable           NO
  PE Size               0
  Total PE              0
  Free PE               0
  Allocated PE          0
  PV UUID               i3qKtd-WqIU-aIxY-0niW-OwLd-GFSC-HjKXu2

物理卷 /dev/sdb1 出现在列表中

1.2 将物理卷添加到卷组

使用 vgextend 命令将物理卷添加到卷组，

$ vgextend centos /dev/sdb1
  Volume group "centos" successfully extended

1.3 在卷组上创建逻辑卷

使用 lvcreate 命令创建逻辑卷，

$ lvcreate -n dm_cache -L 1G centos /dev/sdb1
  Logical volume "dm_cache" created.

1.4 将 fast LV 设置成 main LV 的高速缓存

使用 lvconvert 命令建立高速缓存磁盘和大容量磁盘的关联，

$ lvconvert --type cache --cachepool dm_cache centos/dm_main
  WARNING: Converting centos/dm_cache to cache pool's data volume with metadata wiping.
  THIS WILL DESTROY CONTENT OF LOGICAL VOLUME (filesystem etc.)
Do you really want to convert centos/dm_cache? [y/n]: y
  Converted centos/dm_cache to cache pool.
  Logical volume centos/dm_main is now cached.

查看当前的逻辑卷，

$ lvdisplay
  --- Logical volume ---
  LV Path                /dev/centos/dm_main
  LV Name                dm_main
  VG Name                centos
  LV UUID                FsTspi-GPTM-bCXd-oLZ6-4hdo-OUJo-LErddE
  LV Write Access        read/write
  LV Creation host, time localhost.localdomain, 2021-12-12 02:21:08 -0500
  LV Cache pool name     dm_cache
  LV Cache origin name   dm_main_corig
  LV Status              available
  # open                 0
  LV Size                19.90 GiB
  Cache used blocks      0.05%
  Cache metadata blocks  2.15%
  Cache dirty blocks     0.00%
  Cache read hits/misses 5 / 44
  Cache wrt hits/misses  0 / 0
  Cache demotions        0
  Cache promotions       9
  Current LE             5095
  Segments               1
  Allocation             inherit
  Read ahead sectors     auto
  - currently set to     8192
  Block device           253:2

2 安装性能测试工具

2.1 安装 fio

$ yum install -y fio

fio 是一个多线程 I/O 生成工具,可以生成多种 I/O 模式,用来测试磁盘设备的性能(也包含文件系统:如针对网络文件系统 NFS 的 I/O 测试)。

2.2 下载 MSR Cambridge Traces

点击链接 http://iotta.snia.org/traces/block-io/388，进入下载页面，

3 下载、编译、安装 MapperX

根据 MapperX README.md 的提示步骤，下载、编译、安装 MapperX
编译内核，

$ make CONFIG_DM_CACHE=m -j4

编译指定模块，

$ make CONFIG_DM_CACHE=m modules SUBDIRS=drivers/md

安装模块，

$ sudo make modules_install

安装内核，

$ sudo make install

重启虚拟机，

$ reboot

4 Benchmark

4.1 Random Write Latency

4.1.1 sync

dm-cache 默认的写策略是 write through，确保每次写到高速缓存的数据都能同步写到大容量低速设备上，不必担心数据丢失的风险，

$ dmsetup status centos/dm_main
0 41738240 cache 8 41/2048 128 16384/16384 1690 123 1718562 23599 94 16478 0 3 metadata2 writethrough no_discard_passdown 2 migration_threshold 2048 smq 0 rw -

查看高速缓存的块大小，

$ lvs -o+chunksize centos/dm_main
  LV      VG     Attr       LSize  Pool       Origin          Data%  Meta%  Move Log Cpy%Sync Convert Chunk
  dm_main centos Cwi-a-C--- 19.90g [dm_cache] [dm_main_corig] 0.05   2.15            0.00             64.00k

默认是 64 KB

运行 fio 开始测试，

$ fio -filename=/dev/centos/dm_main -direct=1 -iodepth=1 -thread -rw=randwrite -ioengine=psync -bs=64k -size=100M -numjobs=50 -group_reporting -name=rand_100write_64k_clat > rand_100write_64k_clat

实验数据，

$ cat rand_100write_64k_clat
rand_100write_64k_clat: (g=0): rw=randwrite, bs=(R) 64.0KiB-64.0KiB, (W) 64.0KiB-64.0KiB, (T) 64.0KiB-64.0KiB, ioengine=psync, iodepth=1
...
fio-3.7
Starting 50 threads

rand_100write_64k_clat: (groupid=0, jobs=50): err= 0: pid=3135: Sun Dec 12 07:50:37 2021
  write: IOPS=5154, BW=322MiB/s (338MB/s)(5000MiB/15519msec)
    clat (usec): min=150, max=209870, avg=9572.78, stdev=5985.26
     lat (usec): min=151, max=209874, avg=9575.53, stdev=5985.52
    clat percentiles (usec):
     |  1.00th=[   529],  5.00th=[  1647], 10.00th=[  6849], 20.00th=[  7570],
     | 30.00th=[  8029], 40.00th=[  8455], 50.00th=[  8979], 60.00th=[ 10159],
     | 70.00th=[ 10814], 80.00th=[ 11469], 90.00th=[ 12387], 95.00th=[ 13435],
     | 99.00th=[ 27132], 99.50th=[ 44827], 99.90th=[ 84411], 99.95th=[115868],
     | 99.99th=[154141]
   bw (  KiB/s): min= 1792, max=14336, per=2.00%, avg=6589.61, stdev=1289.25, samples=1497
   iops        : min=   28, max=  224, avg=102.72, stdev=20.18, samples=1497
  lat (usec)   : 250=0.04%, 500=0.80%, 750=1.44%, 1000=1.10%
  lat (msec)   : 2=2.26%, 4=1.90%, 10=50.81%, 20=40.11%, 50=1.15%
  lat (msec)   : 100=0.32%, 250=0.07%
  cpu          : usr=0.16%, sys=1.47%, ctx=79970, majf=0, minf=31
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=0,80000,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
  WRITE: bw=322MiB/s (338MB/s), 322MiB/s-322MiB/s (338MB/s-338MB/s), io=5000MiB (5243MB), run=15519-15519msec

clat（Completion Latency）p50为8979，avg为9572.78，p99.9为84411

重新运行 fio，其他参数不变，设置 iodepth=16，测试 IOPS（注意：psync 是同步 I/O 引擎，iodepth 大于 1 对它基本不起作用，所以结果与 iodepth=1 时接近；要真正提高队列深度，需要改用 libaio 等异步引擎），

write: IOPS=5111, BW=319MiB/s (335MB/s)(5000MiB/15652msec)
iops : min= 42, max= 271, avg=102.47, stdev=23.19, samples=1492

将 dm-cache 的块大小更改成 128KB，

$ lvconvert --splitcache centos/dm_main
  Logical volume centos/dm_main is not cached and cache pool centos/dm_cache is unused.
$ lvconvert --type cache --cachepool dm_cache centos/dm_main --chunksize 128
Do you want wipe existing metadata of cache pool centos/dm_cache? [y/n]: y
  Logical volume centos/dm_main is now cached.

重复上面的实验，先测 clat，

$ fio -filename=/dev/centos/dm_main -direct=1 -iodepth=1 -thread -rw=randwrite -ioengine=psync -bs=128k -size=100M -numjobs=50 -group_reporting -name=rand_100write_128k_clat > rand_100write_128k_clat
 clat (usec): min=294, max=286567, avg=16790.77, stdev=11746.64
 
 clat percentiles (usec):
     |  1.00th=[  1532],  5.00th=[  4146], 10.00th=[  7373], 20.00th=[ 13566],
     | 30.00th=[ 15401], 40.00th=[ 16909], 50.00th=[ 17695], 60.00th=[ 18220],
     | 70.00th=[ 18744], 80.00th=[ 19268], 90.00th=[ 20317], 95.00th=[ 21365],
     | 99.00th=[ 32375], 99.50th=[ 65274], 99.90th=[263193], 99.95th=[270533],
     | 99.99th=[283116]

然后再测 iops，

$ fio -filename=/dev/centos/dm_main -direct=1 -iodepth=16 -thread -rw=randwrite -ioengine=psync -bs=128k -size=100M -numjobs=50 -group_reporting -name=rand_100write_128k_iops > rand_100write_128k_iops
write: IOPS=3133, BW=392MiB/s (411MB/s)(5000MiB/12764msec)
iops        : min=   39, max=   88, avg=62.72, stdev= 9.68, samples=1215

将 dm-cache 的块大小更改成 256KB，

$ lvconvert --splitcache centos/dm_main
  Logical volume centos/dm_main is not cached and cache pool centos/dm_cache is unused.
$ lvconvert --type cache --cachepool dm_cache centos/dm_main --chunksize 256
Do you want wipe existing metadata of cache pool centos/dm_cache? [y/n]: y
  Logical volume centos/dm_main is now cached.

测试 clat，

$ fio -filename=/dev/centos/dm_main -direct=1 -iodepth=1 -thread -rw=randwrite -ioengine=psync -bs=256k -size=100M -numjobs=50 -group_reporting -name=rand_100write_256k_clat > rand_100write_256k_clat
 clat (usec): min=349, max=266827, avg=26564.90, stdev=14122.46
 
 clat percentiles (usec):
     |  1.00th=[  1696],  5.00th=[  6980], 10.00th=[ 21627], 20.00th=[ 24511],
     | 30.00th=[ 25297], 40.00th=[ 25822], 50.00th=[ 26346], 60.00th=[ 26870],
     | 70.00th=[ 27395], 80.00th=[ 28181], 90.00th=[ 30016], 95.00th=[ 33162],
     | 99.00th=[ 80217], 99.50th=[125305], 99.90th=[221250], 99.95th=[231736],
     | 99.99th=[258999]

测试 iops，

$ fio -filename=/dev/centos/dm_main -direct=1 -iodepth=16 -thread -rw=randwrite -ioengine=psync -bs=256k -size=100M -numjobs=50 -group_reporting -name=rand_100write_256k_iops > rand_100write_256k_iops
write: IOPS=1678, BW=420MiB/s (440MB/s)(5000MiB/11916msec)
iops        : min=   13, max=   75, avg=33.92, stdev= 6.59, samples=1107

4.1.2 normal

将 dm-cache 的 write mode 更改为 writeback，

$ lvconvert --splitcache centos/dm_main
  Logical volume centos/dm_main is not cached and cache pool centos/dm_cache is unused.
$ lvconvert --type cache --cachepool dm_cache centos/dm_main --chunksize 64 --cachemode writeback
Do you want wipe existing metadata of cache pool centos/dm_cache? [y/n]: y
  Logical volume centos/dm_main is now cached.

重复上面的实验，

当 dm-cache 块大小为 64KB 时，clat 为，

clat (usec): min=122, max=254254, avg=3028.95, stdev=8495.91
  
clat percentiles (usec):
 |  1.00th=[   293],  5.00th=[   498], 10.00th=[   717], 20.00th=[  1106],
 | 30.00th=[  1450], 40.00th=[  1778], 50.00th=[  2114], 60.00th=[  2442],
 | 70.00th=[  2868], 80.00th=[  3556], 90.00th=[  4686], 95.00th=[  5604],
 | 99.00th=[ 11076], 99.50th=[ 58459], 99.90th=[147850], 99.95th=[183501],
 | 99.99th=[235930]

iops 为，

write: IOPS=14.8k, BW=925MiB/s (970MB/s)(5000MiB/5405msec)
iops : min= 100, max= 782, avg=322.91, stdev=88.47, samples=435

当 dm-cache 块大小为 128KB 时，clat 为，

clat (usec): min=117, max=502708, avg=4851.00, stdev=8269.09

clat percentiles (usec):
 |  1.00th=[   502],  5.00th=[   898], 10.00th=[  1401], 20.00th=[  2278],
 | 30.00th=[  2933], 40.00th=[  3458], 50.00th=[  3949], 60.00th=[  4490],
 | 70.00th=[  5080], 80.00th=[  5932], 90.00th=[  8029], 95.00th=[  9634],
 | 99.00th=[ 18482], 99.50th=[ 43254], 99.90th=[128451], 99.95th=[152044],
 | 99.99th=[308282]

iops 为，

write: IOPS=9541, BW=1193MiB/s (1251MB/s)(5000MiB/4192msec)
iops : min= 56, max= 503, avg=202.64, stdev=75.92, samples=337

当 dm-cache 块大小为 256KB 时，clat 为，

clat (usec): min=205, max=310102, avg=8785.39, stdev=9813.20
 
clat percentiles (usec):
 |  1.00th=[   668],  5.00th=[  1172], 10.00th=[  2278], 20.00th=[  4752],
 | 30.00th=[  5932], 40.00th=[  6849], 50.00th=[  7635], 60.00th=[  8455],
 | 70.00th=[  9503], 80.00th=[ 11207], 90.00th=[ 14222], 95.00th=[ 17695],
 | 99.00th=[ 31065], 99.50th=[ 42730], 99.90th=[149947], 99.95th=[185598],
 | 99.99th=[278922]

iops 为，

write: IOPS=5323, BW=1331MiB/s (1395MB/s)(5000MiB/3757msec)
iops : min= 18, max= 233, avg=119.38, stdev=31.13, samples=278

TODO

Benchmark
实验结果分析

linux 内核内存分配函数

发表于 2021-11-28 分类于学习笔记， linux内核

linux 内核内存分配函数

1 伙伴系统

1.1 alloc_page/alloc_pages/free_pages

1.1.1 alloc_pages_node

struct page *alloc_pages_node(int nid, unsigned int flags, unsigned int order);

参数	含义
nid	NUMA node ID
flags	Usual GFP_ allocation flags
order	The size of the allocation

现在的机器上都是有多个CPU和多个内存块的，以前我们都是将内存块看成是一大块内存，所有CPU访问这块共享内存的开销是一样的，这就是之前普遍使用的SMP模型
但是随着处理器的增加，共享内存可能会导致内存访问冲突越来越厉害，NUMA（Non-Uniform Memory Access）就是在这样的环境下引入的一个模型，比如一台机器有2个处理器、4个内存块，我们将1个处理器和两个内存块关联起来，称为一个 NUMA node，这样这个机器就会有两个 NUMA node，在物理分布上，NUMA node 的处理器和内存块的物理距离更小，因此访问也更快。比如这台机器会分左右两个处理器（cpu1、cpu2），在每个处理器两边放两个内存块（memory1.1, memory1.2, memory2.1,memory2.2），这样 NUMA node1 的 cpu1 访问memory1.1 和 memory1.2 就比访问 memory2.1 和 memory2.2 更快。所以使用 NUMA 的模式如果能尽量保证本 node 内的 CPU 只访问本 node 内的内存块，那这样的效率就是最高的
Get Free Page (GFP) Flags 的取值参考：Physical Page Allocation
需要分配的物理页个数为：2 的 order 次方
alloc_pages_node 的返回值：如果分配成功，是第一个页的指针；如果分配失败，是空指针 NULL

1.1.2 alloc_pages

#define alloc_pages(gfp_mask, order) \
        alloc_pages_node(numa_node_id(), gfp_mask, order)

alloc_pages 是宏定义，逻辑是调用 alloc_pages_node 函数，传递给 nid 参数的值是调用 numa_node_id() 函数获取到的当前 CPU 所处的 NUMA node 的 ID，需要用户传递参数 GFP flags 和需要分配的页个数

1.1.3 alloc_page

#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

alloc_page 是宏定义，逻辑是调用 alloc_pages，传递给 order 参数的值为 0，表示需要分配的物理页个数为 2 的 0 次方，即 1 个物理页，需要用户传递参数 GFP flags

1.1.4 free_pages

// include/linux/gfp.h
// NOTE(review): upstream kernels appear to keep only the declaration in gfp.h,
// with this body living in mm/page_alloc.c — confirm for the kernel version quoted.
/*
 * free_pages - release a block of pages identified by an address.
 * @addr:  address of the first page; 0 is accepted and silently ignored.
 * @order: passed through unchanged to __free_pages().
 */
void free_pages(unsigned long addr, unsigned int order)
{
        if (addr != 0) {
                /* Debug check that the address passes virt_addr_valid(). */
                VM_BUG_ON(!virt_addr_valid((void *)addr));
                /* Convert the address to its struct page and hand off. */
                __free_pages(virt_to_page((void *)addr), order);
        }
}

// mm/page_alloc.c
/*
 * __free_pages - release a block of pages given its first struct page.
 * @page:  first page of the block.
 * @order: 0 takes the single-page path; any other value is forwarded to
 *         __free_pages_ok() together with the page.
 *
 * Nothing is freed unless put_page_testzero() returns true.
 */
void __free_pages(struct page *page, unsigned int order)
{
        /* presumably drops one reference and reports whether it was the last — confirm */
        if (put_page_testzero(page)) {
                if (order == 0)
                        free_hot_cold_page(page, false);
                else
                        __free_pages_ok(page, order);
        }
}

参数	含义
addr	要释放的连续物理页中第一个的地址
order	被释放的物理页个数为 2 的 order 次方

2 Slab

2.1 kmem_cache

2.1.1 kmem_cache_create

kmem_cache_t *kmem_cache_create(const char *name, size_t size,
                                size_t offset, 
                                unsigned long flags,
                                void (*constructor)(void *, kmem_cache_t *,
                                                    unsigned long flags),
                                void (*destructor)(void *, kmem_cache_t *,
                                                   unsigned long flags));

参数	含义
name	被缓存对象的类名
size	每个对象的大小
offset	第一个对象在物理页中的偏移量，通常置为 0
flags	决定内存如何被分配、管理
constructor	构造函数
destructor	析构函数

flags 参数是 bitmap，决定了内存如何被分配、管理，

flag	作用
SLAB_NO_REAP	保护缓存在系统查找内存时不被削减，不推荐
SLAB_HWCACHE_ALIGN	所有数据对象跟高速缓存行对齐，平台依赖，可能浪费内存
SLAB_CACHE_DMA	每个数据对象在 DMA 内存区段分配

参数 constructor 和 destructor 是可选函数(不能只有destructor，而没有constructor )，用来初始化新分配的对象和在内存被作为整体释放给系统之前“清理”对象
constructor 函数不保证在为对象被分配内存后立即被调用，同理，destructor 函数不是立刻在一个对象被释放后调用
constructor 和 destructor 被调用时会收到一个 flags 参数；如果其中设置了 SLAB_CTOR_ATOMIC（即 flags 按位与 SLAB_CTOR_ATOMIC 的结果非零），函数就必须以原子方式执行，不允许在执行中被置于睡眠状态

2.1.2 kmem_cache_alloc

通过调用 kmem_cache_alloc 从已创建的后备高速缓存中分配对象，

void *kmem_cache_alloc(kmem_cache_t *cache, int flags);

参数	含义
cache	调用 kmem_cache_create 函数创建的缓存
flags	GFP（Get Free Page）flags

2.1.3 kmem_cache_free

使用 kmem_cache_free 释放一个对象，

void kmem_cache_free(kmem_cache_t *cache, const void *obj);

2.1.4 kmem_cache_destroy

当用完这个后备高速缓存（通常在当模块被卸载时），释放缓存，

int kmem_cache_destroy(kmem_cache_t *cache);

2.2 kmalloc

使用 kmalloc 函数动态分配内存空间，

void *kmalloc(size_t size, int flags);

参数	含义
size	要分配内存空间的字节数
flags	GFP（Get Free Page）flags

分配到的内存空间可能会略大于 size 参数的值
flags 参数可取以下值，

flags 取值	含义
GFP_KERNEL	在分配内存空间时，当前进程可被置于睡眠状态
GFP_ATOMIC	在分配内存空间时不会睡眠，可在中断处理等原子上下文中使用
GFP_USER	用于为用户空间页分配内存，当前进程可被置于睡眠状态
GFP_HIGHUSER	和 GFP_USER 相似，但是在 high memory 分配内存
GFP_NOIO	和 GFP_KERNEL 相似，但是在内存分配过程中限制 I/O 操作
GFP_NOFS	和 GFP_KERNEL 相似，但是在内存分配过程中限制文件系统调用

更多 flags 的取值可参考 <linux/gfp.h> 头文件

2.3 kzalloc

kzalloc 等价于先调用 kmalloc 分配一块内存空间，然后初始化为0，

/**
 * kzalloc - allocate memory. The memory is set to zero.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Returns whatever kmalloc() returns for the combined flags.
 */
static inline void *kzalloc(size_t size, gfp_t flags)
{
    /* Same call as kmalloc(), with __GFP_ZERO OR-ed into the caller's flags. */
    return kmalloc(size, flags | __GFP_ZERO);
}

2.4 kfree

分配内存后如果不释放会造成内存泄漏，在内核中可能导致系统崩溃，

1	void kfree(const void *objp);

可以调用 kfree 函数释放动态分配的内存

3 vmalloc/vfree

3.1 vmalloc

1	void *vmalloc(unsigned long size);

使用 vmalloc 函数在虚拟地址空间分配连续的内存空间，这些内存的物理地址并不是连续的

3.2 vfree

1	void vfree(void * addr);

使用 vfree 函数释放由 vmalloc 函数分配的内存空间

伙伴算法以及 Slab 机制

发表于 2021-11-28 分类于学习笔记， linux内核

伙伴算法以及 Slab 机制

1 伙伴系统

1.1 页框

Linux 内核将物理内存划分成固定大小（在 x86 架构下长度为 4KB）的块，被称为页框

1.2 组织结构

Linux 内核将所有的空闲页划分成 11 个页块链表，每个页块链表分别包含 1、2、4、8、16、32、64、128、256、512、1024 个连续页框的页块
每个页块的第一个页的物理地址是该块大小的整数倍
最多可以申请 1024 个连续页，对应 4MB 大小的连续内存

1.3 分配

当向内核请求分配包含 (2^(i-1), 2^i] 个页的页块时，如果对应的页块链表有空闲页块，则分配；如果对应的页块链表没有空闲页块，则在更大的页块链表中找，找到后将这个页块一分为二，它们互为 “伙伴” ，一半分配给进程使用，另一半再根据大小放入合适的页块链表中

1.4 释放

进程使用完 1 个页块后，对其进行释放，如果在这个时候，这个页块的 “伙伴” 也为空闲的，则将它们两个合并，然后放到合适的页块链表中
如果合并后的页块的 “伙伴” 依然是空闲的，依旧将它们两个合并，然后放到合适的页块链表中，这样的过程一直持续，直到找不到空闲的 “伙伴” 页块

2 Slab 机制

2.1 背景

页式管理适合大块内存的情形，而对于内核对象级别的较小内存情形下，不足以占用1个页。在 linux 内核中存在许多小对象，这些对象构造、销毁十分频繁，比如 i-node、dentry 之类的，假如这些对象每次创建的时候向内存要一个页（大小为 4KB），然而实际大小可能只有几个字节，这样就非常浪费，为了解决这个问题就引入了一种新的机制来处理在同一页框中如何分配小存储区的问题

2.2 定义

Slab 是一种内存分配器，通过将内存划分为不同大小的空间分配给对象使用，并管理维护高速缓存

2.3 主要作用

为小对象分配内存空间，减少由 “伙伴” 系统机制导致的内部碎片（页块中未被实际使用的内存空间）问题
内核中一些小对象创建、析构很频繁，Slab对这些小对象做缓存，可以重复利用一些相同的对象，减少内存分配次数

2.4 内存分配和管理

Slab 机制将不同的对象划分为所谓的高速缓存（cache）组，其中每个高速缓存都存放不同类型的对象
每种对象类型对应一个高速缓存（cache）
Slab 由一个或多个物理上连续的页组成，每个高速缓存由多个 slab 组成

2.5 高速缓存

2.5.1 查看当前的 Slab 缓存

Linux 内核为常用对象建立缓存，每种类型对应1个高速缓存。文件 /proc/slabinfo 记载了当前的缓存情况，

$ cat /proc/slabinfo
slabinfo - version: 2.1
# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
ext4_groupinfo_1k     60     60    136   60    2 : tunables    0    0    0 : slabdata      1      1      0
ext4_groupinfo_4k    168    168    144   56    2 : tunables    0    0    0 : slabdata      3      3      0
UDPLITEv6              0      0   1088   30    8 : tunables    0    0    0 : slabdata      0      0      0
UDPv6                 60     60   1088   30    8 : tunables    0    0    0 : slabdata      2      2      0
tw_sock_TCPv6          0      0    280   58    4 : tunables    0    0    0 : slabdata      0      0      0
request_sock_TCPv6      0      0    328   49    4 : tunables    0    0    0 : slabdata      0      0      0
TCPv6                 30     30   2112   15    8 : tunables    0    0    0 : slabdata      2      2      0
kcopyd_job             0      0   3312    9    8 : tunables    0    0    0 : slabdata      0      0      0
dm_uevent              0      0   2632   12    8 : tunables    0    0    0 : slabdata      0      0      0
cfq_queue              0      0    232   70    4 : tunables    0    0    0 : slabdata      0      0      0
............

各字段的含义（参考：slabinfo(5) - Linux manual page (man7.org)），

字段	含义
name	对象的类名
active_objs	当前活跃（使用中）的对象个数
num_objs	所有的对象个数，包括使用中和未使用的对象
objsize	每个对象的大小
objperslab	每个 slab 存放对象的个数
pagesperslab	每个 slab 被分配页的个数

2.5.2 划分

为了加速小对象的分配和释放，处于同一个高速缓存中的 slab 被划分到三个链表中：slabs_full、slabs_partial 、slabs_free
slabs_full 链表中的 slab，其所有对象都处于使用中；slabs_partial 链表中的 slab 既有使用中的对象，也有当前未被使用的对象，后者可以分配给进程；slabs_free 链表中的 slab 不包含任何已分配的对象，主要用于 slab 的销毁

Linux 内核同步机制之 Semaphore

发表于 2021-11-22 分类于学习笔记， linux内核

Linux 内核同步机制之 Semaphore

1 Semaphore 的使用

1.1 竞争条件

竞争条件是指多个线程或者进程在读写一个共享数据时结果依赖于它们执行的相对时间的情形
举例：

int open_count = 0;

/*
 * Open handler that admits a caller only while open_count is zero.
 *
 * The test of open_count and its increment are two separate steps with
 * no synchronization between them; this version exists to illustrate
 * that race.
 *
 * Returns 0 on success, -EBUSY when open_count is already non-zero.
 */
int hello_open(struct inode *p, struct file *f) {
    if (open_count == 0) {
        open_count += 1;
        printk(KERN_INFO "hello_open ok\r\n");
        return 0;
    }
    return -EBUSY;
}

上面代码的作用是确保同一时间只有1个进程能够访问 hello_dev 驱动设备，但是由于 CPU 的指令调度，可能存在如下的执行顺序，

初始情况，open_count 为 0，进程 A 执行到 if 语句，条件为假，不执行分支内部的语句，此时，由于 CPU 调度（可能是因为进程 A 时间片到期或者被更高优先级的进程 B 抢占），进程 B 也执行到 if 判断，条件也为假，不执行分支内部的语句，然后进程 A 和 B 都获得了设备的使用权

1.2 使用 Semaphore 实现互斥

1.2.1 声明

1	struct semaphore sema;

在 linux 内核中，信号量为结构体类型 struct semaphore，在 hello_dev 字符设备驱动中，将信号量声明成 struct semaphore 类型的全局变量 sema

1.2.2 初始化

信号量在使用之前需要初始化，

int hello_init(void) {
    devNum = MKDEV(reg_major, reg_minor);
    if(OK == register_chrdev_region(devNum, subDevNum, "helloworld")) {
        printk(KERN_INFO "register_chrdev_region ok \n"); 
    } else {
    printk(KERN_INFO "register_chrdev_region error n");
        return ERROR;
    }
    printk(KERN_INFO " hello driver init \n");
    gDev = kzalloc(sizeof(struct cdev), GFP_KERNEL);
    gFile = kzalloc(sizeof(struct file_operations), GFP_KERNEL);
    gFile->open = hello_open;
    gFile->read = hello_read;
    gFile->write = hello_write;
    gFile->owner = THIS_MODULE;
    cdev_init(gDev, gFile);
    cdev_add(gDev, devNum, 1);
    
    sema_init(&sema, 1);
    return 0;
}

在 hello_dev 字符设备的初始化函数中，调用 sema_init 函数将信号量 sema 初始化为 1，

/*
 * sema_init - initialize a semaphore at run time.
 * @sem: semaphore to initialize
 * @val: initial value stored in the semaphore's count
 *
 * Overwrites *sem with a compound literal built by
 * __SEMAPHORE_INITIALIZER(*sem, val), then registers the embedded lock's
 * dep_map with lockdep under the name "semaphore->lock", using a key
 * that is static to this function.
 */
static inline void sema_init(struct semaphore *sem, int val)
{
    static struct lock_class_key __key;
    *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
    lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0);
}

将信号量初始化为 1，是因为 hello_dev 字符设备需要保证互斥访问，即同一时间只有1个进程或线程能够访问这个设备

1.2.3 使用信号量解决上述的竞争条件问题

/*
 * Open handler that lets only one opener in at a time.
 *
 * The check of open_count and its increment form the critical section
 * and are bracketed by down()/up() on sema. The semaphore is released on
 * BOTH outcomes; returning from inside the critical section without
 * calling up() would leave it held and block every later open in down().
 *
 * Returns 0 on success, -EBUSY if the device is already open.
 */
int hello_open(struct inode *p, struct file *f) {
    int ret = 0;

    down(&sema);
    if (open_count >= 1) {
        ret = -EBUSY;
    } else {
        ++open_count;
    }
    up(&sema);

    if (ret) {
        printk(KERN_INFO "device is busy, hello open failed\n");
        return ret;
    }
    printk(KERN_INFO "hello_open ok\r\n");
    return 0;
}

之前的 hello_open 函数代码之所以存在竞争条件问题，是因为打开设备的过程被分为两个步骤：判断使用量、增加使用量，不具有原子性
将判断、增加使用量的代码称为临界区，在进入临界区之前，先调用 down 函数，将信号量 sema 的值减 1；离开临界区时，调用 up 函数，对信号量 sema 的值加 1

1.2.4 原理浅析

1.2.4.1 基本原理

当对某个信号量调用 down 函数，会对信号量的值减 1，如果得到的结果不小于 0，则可以继续执行临界区代码；如果得到的结果小于 0，则会导致进程被阻塞休眠
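当对某个信号量调用 up 函数，会对信号量的值加 1，如果得到的结果不大于0，就会从进程阻塞队列中根据某种策略选取 1 个进程唤醒，执行临界区代码（注：以上是经典信号量模型的描述；后文的 Linux 源码中 count 是 unsigned int，不会变为负数——down 在 count 为 0 时直接将进程放入等待队列，up 在等待队列非空时直接唤醒队首进程而不增加 count）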

1.2.4.2 案例分析

在初始情况下，信号量 sema 被初始化为 1，此时进程 A 调用 down 函数，将信号量的值减 1，得到的结果为 0，可以继续执行临界区代码
在进程 A 调用 up 函数前，进程 B 也想使用同一设备，于是调用 down 函数，对信号量 sema 的值减 1，得到结果为 -1，从而被阻塞
当进程 A 执行完临界区代码后，调用 up 函数，将信号量 sema 的值加 1，得到结果为 0，于是将进程 B 从进程阻塞队列唤出，于是 B 也能执行临界区代码
进程 B 执行完临界区代码后，也调用 up 函数，将信号量 sema 的值加 1，又回到初始状态 1

2 Semaphore 源码浅析

2.1 数据结构

/* Please don't access any members of this structure directly */
struct semaphore {
    /* Spinlock held by down()/up() while they touch count and wait_list. */
    raw_spinlock_t      lock;
    /* Units still available: down() decrements it while > 0, up() increments it when no waiter is queued. */
    unsigned int        count;
    /* Queue of semaphore_waiter entries: appended at the tail by __down_common(), taken from the head by __up(). */
    struct list_head    wait_list;
};

在 semaphore 结构体内部有 3 个成员变量，分别是：自旋锁 lock、计数器 count、双向链表 wait_list
在 linux 内核中，已经实现了双向链表，类型为 struct list_head，无需再自己实现

2.2 初始化信号量的过程

2.2.1 宏初始化

/*
 * Brace-enclosed designated initializer for a struct semaphore.
 * @name: the semaphore object being initialized; its own lock and
 *        wait_list members are passed to the nested initializer macros
 * @n:    initial value for .count
 */
#define __SEMAPHORE_INITIALIZER(name, n)                \
{                                   \
    .lock       = __RAW_SPIN_LOCK_UNLOCKED((name).lock),    \
    .count      = n,                        \
    .wait_list  = LIST_HEAD_INIT((name).wait_list),     \
}

以宏定义的形式初始化信号量结构体内部的 3 个成员：锁、计数器和双向链表

2.2.2 函数初始化

/*
 * sema_init - initialize a semaphore at run time.
 * @sem: semaphore to initialize
 * @val: initial value stored in the semaphore's count
 *
 * Overwrites *sem with a compound literal built by
 * __SEMAPHORE_INITIALIZER(*sem, val), then registers the embedded lock's
 * dep_map with lockdep under the name "semaphore->lock", using a key
 * that is static to this function.
 */
static inline void sema_init(struct semaphore *sem, int val)
{
    static struct lock_class_key __key;
    *sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
    lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0);
}

2.3 获取信号量的过程

2.3.1 down 函数

/**
 * down - acquire the semaphore
 * @sem: the semaphore to be acquired
 *
 * Acquires the semaphore.  If no more tasks are allowed to acquire the
 * semaphore, calling this function will put the task to sleep until the
 * semaphore is released.
 *
 * Use of this function is deprecated, please use down_interruptible() or
 * down_killable() instead.
 */
void down(struct semaphore *sem)
{
    unsigned long flags;

    /* Take sem->lock (saving the IRQ state in flags) before touching count. */
    raw_spin_lock_irqsave(&sem->lock, flags);
    if (likely(sem->count > 0))
        sem->count--;       /* fast path: a unit is available, take it */
    else
        __down(sem);        /* count is 0: go through the waiting path */
    raw_spin_unlock_irqrestore(&sem->lock, flags);
}

因为对信号量 sem 内部成员的访问可能是并发的，所以使用自旋锁 sem->lock 确保共享变量的互斥访问
如果此时计数器 sem->count 大于 0，则将其减 1，然后释放自旋锁 sem-> lock，退出 down 函数
如果此时计数器 sem->count 小于或等于 0，则调用 __down 函数

2.3.2 __down 函数

/*
 * Waiting path of down(): forwards to __down_common() with
 * TASK_UNINTERRUPTIBLE as the sleep state and MAX_SCHEDULE_TIMEOUT as
 * the timeout. The int result of __down_common() is discarded.
 */
static noinline void __sched __down(struct semaphore *sem)
{
    __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

__down 函数内部又调用 __down_common 函数
宏 TASK_UNINTERRUPTIBLE 表示进程不可中断的睡眠状态，宏 MAX_SCHEDULE_TIMEOUT 被定义成 LONG_MAX

2.3.3 __down_common 函数

/*
 * Queue the current task on sem->wait_list and sleep until it is handed
 * the semaphore, a signal is pending for the given state, or the timeout
 * runs out.
 *
 * @sem:     semaphore to wait on
 * @state:   task state to sleep in (also passed to signal_pending_state)
 * @timeout: remaining time budget, updated from schedule_timeout()
 *
 * In this file it is reached from down(), which holds sem->lock; the
 * loop drops that lock around schedule_timeout() and re-takes it after.
 *
 * Returns 0 once waiter.up has been set, -EINTR when
 * signal_pending_state() reports a pending signal, -ETIME when timeout
 * is <= 0.
 */
static inline int __sched __down_common(struct semaphore *sem, long state, long timeout)
{
    struct task_struct *task = current;
    struct semaphore_waiter waiter;

    /* Append this waiter at the tail of the semaphore's wait list. */
    list_add_tail(&waiter.list, &sem->wait_list);
    waiter.task = task;
    waiter.up = false;

    for (;;) {
        if (signal_pending_state(state, task))
            goto interrupted; // stop waiting: a signal is pending
        if (unlikely(timeout <= 0))
            goto timed_out; // stop waiting: the timeout has expired
        __set_task_state(task, state);
        /* Release the lock while sleeping, re-acquire it before checking waiter.up. */
        raw_spin_unlock_irq(&sem->lock);
        timeout = schedule_timeout(timeout);
        raw_spin_lock_irq(&sem->lock);
        if (waiter.up)
            return 0;
    }

 timed_out:
    /* Still queued at this point, so unlink the waiter before returning. */
    list_del(&waiter.list);
    return -ETIME;

 interrupted:
    /* Still queued at this point, so unlink the waiter before returning. */
    list_del(&waiter.list);
    return -EINTR;
}

在 linux 内核源码中，task_struct 类型是进程的抽象，current 指针始终指向当前正在执行中的进程，在 __down_common 函数的第一行定义 task_struct 类型指针 task 指向当前进程
将当前进程加入到信号量 sem 的等待队列 wait_list 中，这里用双向链表模拟队列，添加元素到表尾即为入队，从表头取出元素即为出队
进入 for 循环内部，signal_pending_state 函数判断当前是否存在中断信号，如果有就跳出死循环，将进程从阻塞队列中移除，__down_common 函数返回 -EINTR
如果 timeout 小于等于 0，则因为超时跳出死循环，将进程从阻塞队列中移除，__down_common 函数返回 -ETIME
schedule_timeout 函数将当前进程置于休眠状态，等待 timeout 时间后继续执行
如果其他进程对这个信号量调用了 up 函数，那么 waiter.up 就置为 1，__down_common 函数返回 0

2.4 释放信号量的过程

2.4.1 up 函数

/**
 * up - release the semaphore
 * @sem: the semaphore to release
 *
 * Release the semaphore.  Unlike mutexes, up() may be called from any
 * context and even by tasks which have never called down().
 */
void up(struct semaphore *sem)
{
    unsigned long flags;

    /* Take sem->lock (saving the IRQ state in flags) before touching count / wait_list. */
    raw_spin_lock_irqsave(&sem->lock, flags);
    if (likely(list_empty(&sem->wait_list)))
        sem->count++;       /* nobody is waiting: just give the unit back */
    else
        __up(sem);          /* a waiter is queued: wake it; count is left unchanged */
    raw_spin_unlock_irqrestore(&sem->lock, flags);
}

因为对信号量 sem 内部成员的访问可能是并发的，所以使用自旋锁 sem->lock 确保共享变量的互斥访问
如果此时等待队列是空的，则将 count 加 1，然后释放自旋锁 sem-> lock，退出 up 函数
如果此时等待队列不为空，调用 __up 函数

2.4.2 __up 函数

/*
 * Wake the first waiter queued on sem->wait_list.
 *
 * In this file it is called from up() with sem->lock held and only when
 * the wait list is non-empty. The waiter is unlinked, its up flag is set
 * (which is what makes __down_common() return 0), and its task is woken.
 */
static noinline void __sched __up(struct semaphore *sem)
{
    struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
                        struct semaphore_waiter, list);
    list_del(&waiter->list);
    waiter->up = true;
    wake_up_process(waiter->task);
}

取出信号量 sem 等待队列的第一个进程，调用 wake_up_process 函数唤醒这个处于睡眠状态的进程，使其由睡眠状态变为RUNNING状态，从而能够被CPU重新调度执行

x86 段页式内存管理和页表映射

发表于 2021-11-21 分类于学习笔记， linux内核

x86 段页式内存管理和页表映射

1 逻辑地址到物理地址的转换

1.1 转换规则

linux 内核的地址转换规则是：从逻辑地址到线性地址，再从线性地址通过页表映射转换为物理地址
从逻辑地址到线性地址的转换规则是：通过段选择器找到对应的段（比如数据段、代码段等），加上段内偏移量定位到线性地址。在 linux 系统中，段基址定义为 0，所以实际上，线性地址等于逻辑地址
从线性地址到物理地址的转换则是通过页表映射实现

1.2 进程隔离

两个不同的进程中，使用相同的逻辑地址，可以访问到不同的物理地址。这是因为不同的进程可以对应不同的页目录表，从而最终定位到的物理地址也就不同

2 多级页表（以两级为例）

2.1 线性地址的划分

在 32 位系统中，线性地址为 32 位，被划分成 3 段，分别是：页目录表索引（10位）、页表索引（10位）、页内偏移量（12位），以此为例，实际情况可能不一样

2.2 定位页目录表

在CR3寄存器中，存放着页目录表的起始地址

2.3 定位页表

根据公式：页目录表的起始地址 + 页目录表项索引 * 4B 得到某个页目录表项，其中存放着页表的起始地址（这里 4B 是一个页目录表项的大小，视具体情况而定）

2.4 定位物理地址

根据公式：页表基址 + 页表项索引 * 4B 得到某个页表项，其中存放着物理地址的基址（这里 4B 是一个页表项的大小，视具体情况而定）
根据公式：物理地址基址 + 页内偏移量，从而得到逻辑地址对应的实际物理地址

内核空间和用户空间的数据拷贝

发表于 2021-11-21 分类于学习笔记， linux内核

内核空间和用户空间的数据拷贝

1 内核空间和用户空间

1.1 地址空间的划分

32位地址总线的寻址范围是4GB，默认情况下，在32位系统中，处于低地址的3GB地址被分配给用户空间，处于高地址的1GB地址被分配给内核空间
在 arch/x86/include/asm/page_32_types.h 中有如下定义，

/*
 * This handles the memory map.
 *
 * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
 * a virtual address space of one gigabyte, which limits the
 * amount of physical memory you can use to about 950MB.
 *
 * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
 * and CONFIG_HIGHMEM64G options in the kernel configuration.
 */
/* Base value comes from the CONFIG_PAGE_OFFSET build option, wrapped by _AC(..., UL). */
#define __PAGE_OFFSET_BASE  _AC(CONFIG_PAGE_OFFSET, UL)
/* __PAGE_OFFSET is simply an alias of __PAGE_OFFSET_BASE here. */
#define __PAGE_OFFSET       __PAGE_OFFSET_BASE

1.2 用户空间和内核空间的区别

1.2.1 权限不同

对于 x86 体系的 CPU，用户空间代码运行在 Ring3 模式，内核空间代码运行在 Ring0 模式
对于 arm 体系的 CPU，用户空间代码运行在 usr 模式，内核空间代码运行在 svc 模式

1.2.2 安全考量

整个系统中有各种资源，比如计算资源、内存资源和外设资源，而 linux 是多用户、多进程系统，所以这些资源必须在受限的、被管理的状态下使用，否则就会陷入混乱。空间隔离可以保证即使是单个应用程序出现错误也不会影响到操作系统的稳定性

1.2.3 侧重点

内核代码偏重于系统管理；而用户空间代码（也即应用程序）偏重于业务逻辑的实现

1.3 应用程序使用内核提供的服务

CPU、内存、外设、文件等资源是受保护的，用户进程无法直接操作，必须以系统调用的形式访问
当应用程序要使用内核提供的服务时，要通过 “访管” 指令陷入到内核态，触发软中断，执行对应的系统调用
当处于进程上下文的程序在运行时，如果发生定时器中断、外设中断，当前进程会被打断，CPU 进入内核态去执行对应的中断处理程序，处理结束后再回到用户进程

2 数据拷贝

2.1 用户空间向内核空间拷贝数据

/*
 * write() handler: copy up to BUFFER_MAX bytes from user space into the
 * driver's buffer.
 *
 * @f: file being written (unused here)
 * @u: user-space source pointer; only accessed through copy_from_user
 * @s: number of bytes the caller asked to write
 * @l: file offset (unused here)
 *
 * The length is clamped to BUFFER_MAX so the copy cannot run past the
 * buffer. It is kept as size_t (the type of @s) and declared before the
 * first statement, as kernel builds warn on mixed declarations.
 *
 * Returns the number of bytes accepted, or -EFAULT if copy_from_user
 * reports a failure.
 */
ssize_t hello_write(struct file *f, const char __user *u, size_t s, loff_t *l) {
    size_t write_len = s < BUFFER_MAX ? s : BUFFER_MAX;

    printk(KERN_INFO "hello_write\r\n");
    if (copy_from_user(buffer, u, write_len)) {
        return -EFAULT;
    }
    return write_len;
}

指针 u 用 __user 宏修饰，表示这是用户空间指针，内核不能直接使用
在内核空间定义缓冲区 buffer，存放 hello_dev 字符设备的数据
write_len 取缓冲区容量 BUFFER_MAX 和写入长度 s 两者中的较小值，防止越界
调用 copy_from_user 函数执行用户空间到内核空间的数据拷贝，传入内核空间地址、用户空间地址、数据长度，成功返回零，失败返回非零值

2.2 内核空间向用户空间拷贝数据

/*
 * read() handler: copy up to BUFFER_MAX bytes from the driver's buffer
 * to user space.
 *
 * @f: file being read (unused here)
 * @u: user-space destination pointer; only accessed through copy_to_user
 * @s: number of bytes the caller asked to read
 * @l: file offset (unused here)
 *
 * The length is clamped to BUFFER_MAX so the copy cannot run past the
 * buffer. It is kept as size_t (the type of @s) and declared before the
 * first statement, as kernel builds warn on mixed declarations.
 *
 * Returns the number of bytes copied, or -EFAULT if copy_to_user reports
 * a failure.
 */
ssize_t hello_read(struct file *f, char __user *u, size_t s, loff_t *l) {
    size_t read_len = s < BUFFER_MAX ? s : BUFFER_MAX;

    printk(KERN_INFO "hello_read\r\n");
    if (copy_to_user(u, buffer, read_len)) {
        return -EFAULT;
    }
    return read_len;
}

调用 copy_to_user 函数执行内核空间到用户空间的数据拷贝，传入用户空间地址、内核空间地址、数据长度，成功返回零，失败返回非零值

2.3 测试

2.3.1 创建字符设备

重新编译内核，

$ make
............
Kernel: arch/x86/boot/bzImage is ready  (#3)
Building modules, stage 2.
MODPOST 19 modules
LD [M]  drivers/char/hello_dev.ko

加载 hello_dev 驱动模块，

1	$ sudo insmod hello_dev.ko

创建 hello_dev 字符设备，

1	$ sudo mknod /dev/hello c 232 0

添加访问权限，

1	$ sudo chmod a+w /dev/hello

2.3.2 编译、运行测试程序

测试程序，

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/select.h>
#include <unistd.h>

#define DATA_LEN (32)

/*
 * Test program for the /dev/hello character device: opens it, writes
 * DATA_LEN bytes, clears the local buffer, reads DATA_LEN bytes back and
 * prints both return values followed by the buffer contents.
 *
 * <unistd.h> supplies the prototypes of read/write/close. The buffer is
 * force-terminated before it is printed with %s, and the descriptor is
 * closed before exit.
 *
 * Returns 0 on success, -1 if the device cannot be opened.
 */
int main(int argc, char *argv[]) {
    char buf[DATA_LEN] = "Hello, world!";
    int fd = open("/dev/hello", O_RDWR);
    if (-1 == fd) {
        /* perror appends ": <reason>\n" itself, so no line ending here. */
        perror("open file error");
        return -1;
    }
    printf("open success\r\n");

    int w_len = (int)write(fd, buf, DATA_LEN);
    memset(buf, 0, DATA_LEN);
    int r_len = (int)read(fd, buf, DATA_LEN);
    /* The device may fill all DATA_LEN bytes; keep %s within bounds. */
    buf[DATA_LEN - 1] = '\0';
    printf("%d %d\r\n", w_len, r_len);
    printf("%s\r\n", buf);

    close(fd);
    return 0;
}

编译，

1	$ gcc hello_dev_test.c -o hello_dev_test

运行，

$ ./hello_dev_test 
open success
32 32
Hello, world!

字符设备驱动代码添加到内核源码树

发表于 2021-11-13 分类于学习笔记， linux内核

字符设备驱动代码添加到内核源码树

1 静态加载

1.1 移动驱动设备源码文件

1	$ cp hello_dev.c ~/workspace/c/src/linux-4.9.229/drivers/char/

将字符设备驱动源码文件拷贝到 linux 内核源码的 drivers/char 目录下

1.2 修改 Kconfig 文件

在 drivers/char 目录下的 Kconfig 文件中添加如下几行，

config HELLO_DEV
    tristate "hello device"
    default y
    help
      hello device

config 关键字后面跟着的是在 make menuconfig 中的配置项名称
关于 tristate ，前缀 tri- 表示“三”，state 表示“状态”，也就是说这个模块可以有三种状态：y(编译进内核)、m(作为独立模块编译)、n(不被编译)
如果是 bool 则说明这个模块只有两种状态：y(编译进内核)、n(不被编译)
default 后面跟着模块的默认状态，当前是 y(编译进内核)
help 后面的内容是此模块的帮助信息

1.3 查看修改后的配置

执行 make menuconfig 命令可视化查看配置信息，

1	$ make menuconfig

选择 Device Drivers 进入，

选择 Character devices 进入，

字符设备驱动模块 hello device 确实出现在列表中，左侧的 “*” 表示 built-in，即将此模块编译到内核中，如果是 “M” 表示作为独立模块编译，“[]” 表示不编译此模块

选择 Help 进入，

可以看到 hello_dev 模块的帮助信息

1.4 修改 Makefile 文件

在 drivers/char 目录下的 Makefile 文件添加以下内容，

1	obj-$(CONFIG_HELLO_DEV) += hello_dev.o

1.5 编译 linux 内核

在 linux 内核源码根目录下执行 make 命令，

1	$ make -j4

2 动态加载

2.1 静态加载、动态加载的区别

静态加载：驱动源码被编译到内核里面，随着内核启动，不需要手动通过 insmod 命令加载
动态加载：驱动作为独立模块编译，生成 .ko 文件，需要在内核启动后，手动或者通过脚本的形式执行 insmod 命令加载设备驱动模块

2.2 修改编译配置文件

执行 make menuconfig，

1	$ make menuconfig

依次选择 Device Drivers -> Character devices 进入，

配置 hello device 驱动作为独立模块编译，

2.3 编译 linux 内核

在 linux 内核源码根目录下执行 make 命令，

1	$ make -j4

2.4 查看编译产物

切换到 hello_dev 驱动源码所在目录，

1	$ cd drivers/char/

确认是否存在对应的 .ko 文件，

$ ll hello_dev.ko
-rw-rw-r-- 1 lnhoo lnhoo 5535 Nov 13 20:51 hello_dev.ko

linux 内核源码编译过程

发表于 2021-11-07 分类于学习笔记， linux内核

linux 内核源码编译过程

1 源码根目录 Makefile

1.1 编译流程

先执行各子目录的 Makefile，生成各自的编译产物 built-in.o 文件，最后再由链接器将二进制文件链接成完整的内核镜像文件 vmlinux

1.2 体系结构相关代码编译

1
2
3

SRCARCH     := $(ARCH)
............
include arch/$(SRCARCH)/Makefile

ARCH 变量是在编译内核源码前通过 export 命令指定的，比如

1	$ export ARCH=x86

源码根目录 Makefile 文件包含了体系结构相关目录的 Makefile 文件

1.3 子模块相关代码编译

ifeq ($(KBUILD_EXTMOD),)
core-y      += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/

vmlinux-dirs    := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
             $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
             $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))

vmlinux-alldirs := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
             $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))

init-y      := $(patsubst %/, %/built-in.o, $(init-y))
core-y      := $(patsubst %/, %/built-in.o, $(core-y))
drivers-y   := $(patsubst %/, %/built-in.o, $(drivers-y))
net-y       := $(patsubst %/, %/built-in.o, $(net-y))
libs-y1     := $(patsubst %/, %/lib.a, $(libs-y))
libs-y2     := $(patsubst %/, %/built-in.o, $(libs-y))
libs-y      := $(libs-y1) $(libs-y2)
virt-y      := $(patsubst %/, %/built-in.o, $(virt-y))

各子模块执行对应目录下的 Makefile 文件，生成各自的 built-in.o 文件，最后再根据源码根目录下的 Makefile 文件链接成完整的内核镜像文件 vmlinux

2 子目录 (以 drivers/tty 目录为例)

2.1 Kconfig 文件

Kconfig 文件存放编译的配置信息，决定着哪些 .c 文件要被编译，是编译到内核中，还是以独立模块的形式编译
配置 n 表示不编译，y 表示编译到内核中，m 表示以独立模块的形式编译
执行 make menuconfig 命令，可以在命令行可视化查看、编辑 Kconfig 文件中的配置

2.2 Makefile 文件

截取其中的1个片段，

obj-y               += irqchip/
obj-y               += bus/

obj-$(CONFIG_GENERIC_PHY)   += phy/

# GPIO must come after pinctrl as gpios may need to mux pins etc
obj-$(CONFIG_PINCTRL)       += pinctrl/
obj-$(CONFIG_GPIOLIB)       += gpio/
obj-y               += pwm/
obj-$(CONFIG_PCI)       += pci/
obj-$(CONFIG_PARISC)        += parisc/
obj-$(CONFIG_RAPIDIO)       += rapidio/
obj-y               += video/
obj-y               += idle/
............

obj-y 表示将指定目录下的代码编译到内核
obj-$(变量) 表示根据 Kconfig 文件中的配置决定是否将该目录下的源码编译到内核中，这个变量会在 Kconfig 文件中设置

应用层的 write 如何调用驱动里的 write

发表于 2021-11-07 分类于学习笔记， linux内核

应用层的 write 如何调用驱动里的 write

1 编写应用程序测试 hello_dev 驱动

1.1 编写测试程序

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/select.h>
#include <unistd.h>

#define DATA_LEN (64)

/*
 * Test program for the /dev/hello character device: opens it, writes
 * DATA_LEN bytes, clears the local buffer, reads DATA_LEN bytes back and
 * prints both return values followed by the buffer contents.
 *
 * <unistd.h> supplies the prototypes of read/write/close. The buffer is
 * force-terminated before it is printed with %s, and the descriptor is
 * closed before exit.
 *
 * Returns 0 on success, -1 if the device cannot be opened.
 */
int main(int argc, char *argv[]) {
    char buf[DATA_LEN] = "Hello, world!";
    int fd = open("/dev/hello", O_RDWR);
    if (-1 == fd) {
        /* perror appends ": <reason>\n" itself, so no line ending here. */
        perror("open file error");
        return -1;
    }
    printf("open success\r\n");

    int w_len = (int)write(fd, buf, DATA_LEN);
    memset(buf, 0, DATA_LEN);
    int r_len = (int)read(fd, buf, DATA_LEN);
    /* The device may fill all DATA_LEN bytes; keep %s within bounds. */
    buf[DATA_LEN - 1] = '\0';
    printf("%d %d\r\n", w_len, r_len);
    printf("%s\r\n", buf);

    close(fd);
    return 0;
}

打开 “/dev/hello” 设备，写入数据，然后再读取，控制台输出write、read函数的返回值、缓冲区中的内容

1.2 创建字符设备

执行 insmod 命令加载驱动到内核，

1	$ sudo insmod hello_dev.ko

使用 mknod 命令创建设备，

MKNOD(1)                                                              User Commands                                                             MKNOD(1)
NAME
       mknod - make block or character special files
SYNOPSIS
       mknod [OPTION]... NAME TYPE [MAJOR MINOR]
DESCRIPTION
       Create the special file NAME of the given TYPE.

参数	说明
NAME	设备名称
TYPE	设备类型，b表示块设备，c表示字符设备，p表示FIFO
MAJOR	主设备号
MINOR	次设备号

hello_dev 驱动文件中定义主设备号为 232，次设备号为 0，于是创建字符设备，

1	sudo mknod /dev/hello c 232 0

此时执行 ls -l 命令查看设备是否存在，

$ ls -l /dev/hello
crw-r--r-- 1 root root 232, 0 Nov 6 23:52 /dev/hello

1.3 编译、运行测试程序

使用 gcc 编译测试程序，

1	$ gcc -o hello_dev_test hello_dev_test.c

运行测试程序，查看结果，

1
2
3

$ ./hello_dev_test 
open success
0 0

这里 w_len、r_len 为 0，缓冲区为空，因为 hello_dev 驱动源码中，write、read 函数输出内核日志后就直接返回 0 了

查看linux内核日志，

$ dmesg | tail -3
[ 9040.652155] hello_open
[ 9040.652186] hello_write
[ 9040.652188] hello_read

hello_dev 驱动程序中定义的 open、write、read 函数确实被调用了

2 从应用层 write 调用到驱动程序中定义的write

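调用流程：应用程序 write() → C 函数库 write → write 系统调用（SYSCALL_DEFINE3）→ vfs_write → __vfs_write → file->f_op->write（即驱动中注册的 hello_write）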

应用进程调用 c 函数库中定义的 write
c write 函数接着陷入系统调用，

/*
 * write system call entry point (defined through SYSCALL_DEFINE3).
 * @fd:    file descriptor supplied by the caller
 * @buf:   user-space buffer holding the data to write
 * @count: number of bytes requested
 *
 * Resolves fd through fdget_pos(); if that yields no file, the initial
 * value -EBADF is returned. Otherwise the result of vfs_write() is
 * returned, and the position is stored back only when that result is
 * not negative.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
{
    struct fd f = fdget_pos(fd);
    ssize_t ret = -EBADF;

    if (f.file) {
        /* Work on a local copy of the position; presumably the file's current offset — confirm in file_pos_read. */
        loff_t pos = file_pos_read(f.file);
        ret = vfs_write(f.file, buf, count, &pos);
        if (ret >= 0)
            file_pos_write(f.file, pos);
        /* Pairs with fdget_pos() above. */
        fdput_pos(f);
    }

    return ret;
}

linux 的系统调用由 SYSCALL_DEFINE 定义

write 系统调用根据文件描述符获取到实际的 file 结构体对象，作为其中1个参数，调用 vfs_write 函数，

/*
 * Validate a write request and hand it to __vfs_write().
 *
 * @file:  file being written
 * @buf:   user-space source buffer
 * @count: requested byte count, capped at MAX_RW_COUNT
 * @pos:   position pointer forwarded to the lower layers
 *
 * Returns -EBADF if the file lacks FMODE_WRITE, -EINVAL if it lacks
 * FMODE_CAN_WRITE, -EFAULT if access_ok() rejects the buffer, the
 * non-zero result of rw_verify_area() if that check fails, and otherwise
 * whatever __vfs_write() returns.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
    ssize_t status;

    if ((file->f_mode & FMODE_WRITE) == 0)
        return -EBADF;
    if ((file->f_mode & FMODE_CAN_WRITE) == 0)
        return -EINVAL;
    if (unlikely(!access_ok(VERIFY_READ, buf, count)))
        return -EFAULT;

    status = rw_verify_area(WRITE, file, pos, count);
    if (status)
        return status;

    if (count > MAX_RW_COUNT)
        count = MAX_RW_COUNT;

    file_start_write(file);
    status = __vfs_write(file, buf, count, pos);
    if (status > 0) {
        /* Only a write that moved data triggers the notification and the per-task byte accounting. */
        fsnotify_modify(file);
        add_wchar(current, status);
    }
    inc_syscw(current);
    file_end_write(file);

    return status;
}

vfs_write 函数调用底层的 __vfs_write 函数，

/*
 * Dispatch a write to the file's operations table.
 *
 * Prefers the ->write hook; falls back to new_sync_write() when only
 * ->write_iter is set; returns -EINVAL when neither hook is present.
 */
ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, loff_t *pos)
{
    if (file->f_op->write)
        return file->f_op->write(file, p, count, pos);

    if (file->f_op->write_iter)
        return new_sync_write(file, p, count, pos);

    return -EINVAL;
}

在 __vfs_write 函数中，判断此文件类型是否有定义 write 函数，如果有则直接调用返回结果。file 结构体的 f_op 字段是 file_operations 类型，在 hello_dev 驱动源码中，

devNum = MKDEV(reg_major, reg_minor);
............
gDev = kzalloc(sizeof(struct cdev), GFP_KERNEL);
gFile = kzalloc(sizeof(struct file_operations), GFP_KERNEL);
gFile->open = hello_open;
gFile->read = hello_read;
gFile->write = hello_write;
gFile->owner = THIS_MODULE;
cdev_init(gDev, gFile);
cdev_add(gDev, devNum, 1);